def main():
    parser = argparse.ArgumentParser(description='Running DeepSpeech inference.')
    parser.add_argument('--model', required=True,
                        help='Path to the model (protocol buffer binary file)')
    parser.add_argument('--alphabet', required=True,
                        help='Path to the configuration file specifying the alphabet used by the network')
    parser.add_argument('--lm', nargs='?',
                        help='Path to the language model binary file')
    parser.add_argument('--trie', nargs='?',
                        help='Path to the language model trie file created with native_client/generate_trie')
    parser.add_argument('--audio', required=True,
                        help='Path to the audio file to run (WAV format)')
    parser.add_argument('--version', action=VersionAction,
                        help='Print version and exits')
    parser.add_argument('--extended', required=False, action='store_true',
                        help='Output string from extended metadata')
    args = parser.parse_args()

    print('Loading model from file {}'.format(args.model), file=sys.stderr)
    model_load_start = timer()
    ds = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

    if args.lm and args.trie:
        print('Loading language model from files {} {}'.format(args.lm, args.trie), file=sys.stderr)
        lm_load_start = timer()
        ds.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_ALPHA, LM_BETA)
        lm_load_end = timer() - lm_load_start
        print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr)

    fin = wave.open(args.audio, 'rb')
    fs = fin.getframerate()
    if fs != 16000:
        print('Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'.format(fs), file=sys.stderr)
        fs, audio = convert_samplerate(args.audio)
    else:
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
    audio_length = fin.getnframes() * (1 / 16000)
    fin.close()

    print('Running inference.', file=sys.stderr)
    inference_start = timer()
    if args.extended:
        print(metadata_to_string(ds.sttWithMetadata(audio, fs)))
    else:
        print(ds.stt(audio, fs))
    inference_end = timer() - inference_start
    print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
def load_model(models, alphabet, lm, trie):
    N_FEATURES = 26
    N_CONTEXT = 9
    BEAM_WIDTH = 500
    LM_ALPHA = 0.75
    LM_BETA = 1.85

    model_load_start = timer()
    ds = Model(models, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    model_load_end = timer() - model_load_start
    logging.debug("Loaded model in %0.3fs." % (model_load_end))

    lm_load_start = timer()
    ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)
    lm_load_end = timer() - lm_load_start
    logging.debug('Loaded language model in %0.3fs.' % (lm_load_end))

    return [ds, model_load_end, lm_load_end]
def tflite_worker(model, alphabet, lm, trie, queue_in, queue_out, gpu_mask):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_mask)
    ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
    ds.enableDecoderWithLM(alphabet, lm, trie, LM_ALPHA, LM_BETA)

    while True:
        msg = queue_in.get()

        fin = wave.open(msg['filename'], 'rb')
        fs = fin.getframerate()
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        audio_length = fin.getnframes() * (1 / 16000)
        fin.close()

        decoded = ds.stt(audio, fs)

        queue_out.put({'prediction': decoded, 'ground_truth': msg['transcript']})
        queue_in.task_done()
class Tester(BaseTester):
    name = 'DeepSpeech'
    audio_format = RATE16K_MONO_WAV

    def __init__(self, *args, **kwargs):
        super(Tester, self).__init__(*args, **kwargs)

        files = [args_lm, args_trie, args_model, args_alphabet]
        for f in files:
            assert os.path.isfile(f)

        print('Loading model from file %s' % (args_model), file=sys.stderr)
        model_load_start = timer()
        self.ds = Model(args_model, N_FEATURES, N_CONTEXT, args_alphabet, BEAM_WIDTH)
        model_load_end = timer() - model_load_start
        print('Loaded model in %0.3fs.' % (model_load_end), file=sys.stderr)

        if args_lm and args_trie:
            print('Loading language model from files %s %s' % (args_lm, args_trie), file=sys.stderr)
            lm_load_start = timer()
            self.ds.enableDecoderWithLM(args_alphabet, args_lm, args_trie, LM_ALPHA, LM_BETA)
            lm_load_end = timer() - lm_load_start
            print('Loaded language model in %0.3fs.' % (lm_load_end), file=sys.stderr)

    def audio_to_text(self, fn):
        fin = wave.open(fn, 'rb')
        fs = fin.getframerate()
        assert fs == 16000, "Only 16000Hz input WAV files are supported for now!"
        audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        audio_length = fin.getnframes() * (1 / 16000)
        fin.close()

        print('Running inference.', file=sys.stderr)
        inference_start = timer()
        text = self.ds.stt(audio, fs)
        print('text:', text)
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
        return text
# The audio file we want to transcribe
AUDIO_FILE = "test.wav"

# Convert the model files to absolute paths (required by DeepSpeech)
speech_model_path = str(CURRENT_FOLDER / SPEECH_MODEL)
language_model_path = str(CURRENT_FOLDER / LANGUAGE_MODEL)
language_model_trie_path = str(CURRENT_FOLDER / LANGUAGE_MODEL_TRIE)
alphabet_config_path = str(CURRENT_FOLDER / ALPHABET_CONFIG)
audio_file_path = str(CURRENT_FOLDER / AUDIO_FILE)

# Load the pre-trained speech model
deepspeech_model = DeepSpeechModel(
    speech_model_path,
    NUM_MFCC_FEATURES,
    NUM_SAMPLES_PER_WINDOW,
    alphabet_config_path,
    CTC_BEAM_WIDTH,
)

# Load the pre-trained language model
deepspeech_model.enableDecoderWithLM(
    alphabet_config_path,
    language_model_path,
    language_model_trie_path,
    LANGUAGE_MODEL_WEIGHT,
    LANGUAGE_MODEL_WORD_INS_BONUS,
)

# Load audio file using the wave library
with wave.open(audio_file_path, 'rb') as input_wave_file:
class Speech2Text:
    def __init__(self, model_path='..\\models\\frozen_graphs\\output_graph.pbmm', beam_width=500):
        self.model_path = model_path
        self.beam_width = beam_width

        print('Loading model from file {}'.format(model_path), file=sys.stderr)
        model_load_start = timer()
        self.speech_recog_engine = Model(model_path, beam_width)
        model_load_end = timer() - model_load_start
        print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr)

        self.desired_sample_rate = self.speech_recog_engine.sampleRate()

    @staticmethod
    def convert_samplerate(audio_path, desired_sample_rate):
        sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(
            quote(audio_path), desired_sample_rate)
        try:
            output = subprocess.check_output(shlex.split(sox_cmd), stderr=subprocess.PIPE)
        except subprocess.CalledProcessError as e:
            raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
        except OSError as e:
            raise OSError(
                e.errno,
                'SoX not found, use {}hz files or install it: {}'.format(desired_sample_rate, e.strerror))
        return desired_sample_rate, np.frombuffer(output, np.int16)

    @staticmethod
    def metadata_to_string(metadata):
        return ''.join(item.character for item in metadata.items)

    def words_from_metadata(self, metadata):
        word = ""
        word_list = []
        word_start_time = 0
        # Loop through each character
        for i in range(0, metadata.num_items):
            item = metadata.items[i]
            # Append character to word if it's not a space
            if item.character != " ":
                word = word + item.character
            # Word boundary is either a space or the last character in the array
            if item.character == " " or i == metadata.num_items - 1:
                word_duration = item.start_time - word_start_time
                if word_duration < 0:
                    word_duration = 0

                each_word = dict()
                each_word["word"] = word
                each_word["start_time"] = round(word_start_time, 4)
                each_word["duration"] = round(word_duration, 4)
                word_list.append(each_word)

                # Reset
                word = ""
                word_start_time = 0
            else:
                if len(word) == 1:
                    # Log the start time of the new word
                    word_start_time = item.start_time

        return word_list

    def load_audio_file(self, wav_audio_path):
        fin = wave.open(wav_audio_path, 'rb')
        frame_rate = fin.getframerate()
        if frame_rate != self.desired_sample_rate:
            print(
                'Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'
                .format(frame_rate, self.desired_sample_rate),
                file=sys.stderr)
            fs, audio = Speech2Text.convert_samplerate(wav_audio_path, self.desired_sample_rate)
        else:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        nframes = fin.getnframes()
        fin.close()
        return audio, nframes, frame_rate

    @staticmethod
    def load_audio_file_static(wav_audio_path, desired_sample_rate):
        fin = wave.open(wav_audio_path, 'rb')
        frame_rate = fin.getframerate()
        if frame_rate != desired_sample_rate:
            print(
                'Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.'
                .format(frame_rate, desired_sample_rate),
                file=sys.stderr)
            fs, audio = Speech2Text.convert_samplerate(wav_audio_path, desired_sample_rate)
        else:
            audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
        nframes = fin.getnframes()
        fin.close()
        return audio, nframes, frame_rate

    def convert_to_text(self, audio, nframes, frame_rate):
        audio_length = nframes * (1 / frame_rate)
        print('Running inference.', file=sys.stderr)
        inference_start = timer()
        text = self.speech_recog_engine.stt(audio)
        inference_end = timer() - inference_start
        print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
        return text
def main(): "Launch point" parser = argparse.ArgumentParser() parser.add_argument('--youtube-id', action="store", help="Provide youtube video ID") parser.add_argument('--model', required=True, help='Path to the model (protocol buffer binary file)') parser.add_argument( '--alphabet', required=True, help= 'Path to the configuration file specifying the alphabet used by the network' ) parser.add_argument('--lm', nargs='?', help='Path to the language model binary file') parser.add_argument( '--trie', nargs='?', help= 'Path to the language model trie file created with native_client/generate_trie' ) parser.add_argument('--crop-time', type=int, help='You could process only n seconds.') args = parser.parse_args() file_name = download(args.youtube_id, crop_time=args.crop_time) print('Loading model from file {}'.format(args.model), file=sys.stderr) model_load_start = timer() deepspeech = Model(args.model, N_FEATURES, N_CONTEXT, args.alphabet, BEAM_WIDTH) model_load_end = timer() - model_load_start print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr) if args.lm and args.trie: print('Loading language model from files {} {}'.format( args.lm, args.trie), file=sys.stderr) lm_load_start = timer() deepspeech.enableDecoderWithLM(args.alphabet, args.lm, args.trie, LM_WEIGHT, VALID_WORD_COUNT_WEIGHT) lm_load_end = timer() - lm_load_start print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr) fin = wave.open(file_name, 'rb') framerate_sample = fin.getframerate() if framerate_sample != 16000: print( 'Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.' .format(framerate_sample), file=sys.stderr) fin.close() return else: audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16) audio_length = fin.getnframes() * (1 / 16000) fin.close() print('Running inference.', file=sys.stderr) inference_start = timer() result_sub = deepspeech.stt(audio, framerate_sample) result = " ".join( filter(lambda word: len(word) < 15, result_sub.split(" "))) f = open("result_text", 'w') f.write(result) f.close() print("Building top 20 keywords...") keyphrases = keywords.extract_keyphrases(result) print(keyphrases) print("Building summary sentence...") print(keywords.extract_summary_sentence(result)) inference_end = timer() - inference_start print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
# SIU KING WAI SM4701 Deepstory
# Please create a folder _test
import numpy as np
import csv
import scipy.io.wavfile as wav
from deepspeech import Model
import os
import sys
import glob
import pandas as pd

ds = Model('deepspeech-0.7.0-models.pbmm')


def transcribe(path='', array=None, silence=0.3):
    if array is not None:
        audio = array
    else:
        sr, audio = wav.read(path)
        audio = np.pad(audio, (int(sr * silence), int(sr * 0.5)), 'constant')
    return ds.stt(audio)


def main(voice):
    audio_list = glob.glob(f'{voice}_test/*.wav')
    transcription = pd.DataFrame(audio_list, columns=['path'])
    transcription['filename'] = transcription['path'].str.split(
        '.', expand=True)[0].apply(lambda x: os.path.split(x)[1])
    transcription['script'] = transcription['path'].apply(transcribe)
    transcription.drop(columns=['path'], inplace=True)
    transcription.to_csv(f'{voice}_transcription.csv', encoding='utf-8',
def main(): start = time.time() model_path = constants.fetch_contant('deepspeech', 'model_path') alphabet_path = constants.fetch_contant('deepspeech', 'alphabet_path') print('Loading model from file {}'.format(model_path), file=sys.stderr) ds = Model(model_path, N_FEATURES, N_CONTEXT, alphabet_path, BEAM_WIDTH) print('Loaded accoustic model after: ' + str((time.time()) - start), file=sys.stderr) lm_path = constants.fetch_contant('deepspeech', 'lm_path') trie_path = constants.fetch_contant('deepspeech', 'trie_path') print('Loading language model from files {} {}'.format(lm_path, trie_path), file=sys.stderr) ds.enableDecoderWithLM(alphabet_path, lm_path, trie_path, LM_ALPHA, LM_BETA) print('Loaded language model after: ' + str((time.time()) - start), file=sys.stderr) misc.reset_folders([CHUNK_PATH, TASK_PATH, CUST_PATH]) con = None try: con = psycopg2.connect( "host='192.168.0.102' dbname='sales' user='******' password='******'" ) cur = con.cursor() sql = 'delete from benchmark_deepspeech' cur.execute(sql) con.commit() print('Fetching tasks...') taskIds = fetchTaskIds() print('Fetched ' + str(len(taskIds)) + ' tasks!') for taskId in taskIds: print('Started ' + str(taskId)) try: downloadResult = downloadTaskAudio(str(taskId)) if downloadResult["success"]: customerPath = removeAgentChannel( downloadResult["abs_path"]) #customerPath = upsampleAudio(customerPath, 16000) snippets = performVAD(customerPath) for snippet in snippets: text = transcribe(snippet.path, ds) text = re.sub("'", "''", text) if len(text) > 0: url = 'http://192.168.0.100:5010/dschunks/' + re.sub( CHUNK_PATH, '', snippet.path) sql = "INSERT INTO public.benchmark_deepspeech (created_at, updated_at, audio_url, " sql += "audio_path, is_verified, ds_transcription, real_transcription, cer, wer, task_id, from_time," sql += " to_time) VALUES(now(), now(), '" + url + "', '" + snippet.path + "', false, '" + text + "', NULL, NULL, " sql += "NULL, " + str(taskId) + ", " + str( snippet.from_time) + ", " + str( snippet.to_time) + ");" print(sql) cur.execute(sql) con.commit() except Exception as e: print(e) print('Finished ' + str(taskId)) #break except psycopg2.DatabaseError as e: if con: con.rollback() print(e) sys.exit(1) finally: if con: con.close()
class InferenceThread(QObject): finished = Signal(str) def __init__(self): super(InferenceThread, self).__init__() self.in_queue = queue.Queue() self.should_quit = False self.worker = threading.Thread(target=self.run) def send_cmd(self, cmd): ''' Insert command in queue to be processed by the thread ''' self.in_queue.put(cmd) def setQuit(self): ''' Signal to the thread that it should stop running ''' self.should_quit = True def start(self): self.worker.start() def run(self): # Creating the model self.model = Model( os.path.join(os.path.dirname(__file__), "deepspeech-0.5.1-models/output_graph.pbmm"), N_FEATURES, N_CONTEXT, os.path.join(os.path.dirname(__file__), "deepspeech-0.5.1-models/alphabet.txt"), BEAM_WIDTH) self.model.enableDecoderWithLM( os.path.join(os.path.dirname(__file__), "deepspeech-0.5.1-models/alphabet.txt"), os.path.join(os.path.dirname(__file__), "deepspeech-0.5.1-models/lm.binary"), os.path.join(os.path.dirname(__file__), "deepspeech-0.5.1-models/trie"), LM_ALPHA, LM_BETA) stream = None while True: # Try to get the next command from our queue, use a timeout to check # periodically for a quit signal so the application doesn't hang on # exit. try: cmd, *data = self.in_queue.get(timeout=0.3) except queue.Empty: if self.should_quit: break # If we haven't received a quit signal just continue trying to # get a command from the queue indefinitely continue if cmd == "start": # "start" means create a new stream stream = self.model.setupStream() logging.debug("Starts to process sound") elif cmd == "data": # "data" means we received more audio data from the recorder if stream: self.model.feedAudioContent( stream, np.frombuffer(data[0].data(), np.int16)) elif cmd == "finish": # "finish" means the caller wants the result of the current stream transcript = self.model.finishStream(stream) self.finished.emit(transcript) stream = None logging.debug("Finishes to process sound")
def save_checkpoint(state, loss):
    """Save checkpoint if a new best is achieved"""
    fname = "checkpoint_" + time.strftime("%Y%m%d-%H%M%S") + "_" + str(loss.item()) + ".pth.tar"
    torch.save(state, get_rel_path(os.path.join(CHECKPOINTS_FOLDER, fname)))  # save checkpoint
    print("$$$ Saved a new checkpoint\n")


##################### Voice-To-Text ##############
####### Deepspeech Voice-To-Text Parameters ########
DS_model_file_path = 'deepspeech_data/deepspeech-0.7.0-models.pbmm'
beam_width = 500
DS_model = Model(DS_model_file_path)
DS_model.setBeamWidth(beam_width)
DS_model.enableExternalScorer('deepspeech_data/deepspeech-0.7.0-models.scorer')


def get_text(data, model=DS_model):
    # y, s = librosa.load(fpath, sr=16000)
    y = (data * 32767).astype('int16')
    text = model.stt(y)
    return text


def get_text_score(phrase1, phrase2):
    return SequenceMatcher(a=phrase1, b=phrase2).ratio()
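# NOTE (added example): the commented-out librosa call above hints at the input get_text()
# expects -- float samples in [-1, 1] at 16 kHz, which it rescales to int16 for DeepSpeech.
# A minimal usage sketch under that assumption; 'sample.wav' is a hypothetical file name.
import librosa

data, sr = librosa.load('sample.wav', sr=16000, mono=True)  # mono float32 waveform in [-1, 1]
transcript = get_text(data)
print(transcript)
print(get_text_score(transcript, 'expected phrase'))  # SequenceMatcher similarity in [0, 1]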
import subprocess
from pydub import AudioSegment

try:
    from shlex import quote
except ImportError:
    from pipes import quote

model_file_path = '/analyze/model/deepspeech-0.9.3-models.pbmm'
lm_file_path = '/analyze/model/deepspeech-0.9.3-models.scorer'
beam_width = 500
lm_alpha = 0.93
lm_beta = 1.18

model = Model(model_file_path)
model.enableExternalScorer(lm_file_path)
model.setScorerAlphaBeta(lm_alpha, lm_beta)
model.setBeamWidth(beam_width)


def read_wav_file(filename):
    with wave.open(filename, 'rb') as w:
        rate = w.getframerate()
        frames = w.getnframes()
        buffer = w.readframes(frames)
        print(rate)
        print(frames)

    return buffer, rate
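# NOTE (added example): read_wav_file() returns the raw PCM buffer and its sample rate but is
# never called in this excerpt. A minimal sketch of how it could feed the model, assuming a
# 16 kHz mono WAV; 'audio.wav' is a hypothetical file name.
import numpy as np


def transcribe_file(filename):
    buffer, rate = read_wav_file(filename)
    audio = np.frombuffer(buffer, dtype=np.int16)  # 16-bit PCM -> int16 array DeepSpeech expects
    return model.stt(audio)


print(transcribe_file('audio.wav'))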
def main(): parser = argparse.ArgumentParser( description='Running DeepSpeech inference.') parser.add_argument('--model', required=True, help='Path to the model (protocol buffer binary file)') parser.add_argument('--scorer', required=False, help='Path to the external scorer file') parser.add_argument('--audio', required=True, help='Path to the audio file to run (WAV format)') parser.add_argument('--beam_width', type=int, help='Beam width for the CTC decoder') parser.add_argument( '--lm_alpha', type=float, help= 'Language model weight (lm_alpha). If not specified, use default from the scorer package.' ) parser.add_argument( '--lm_beta', type=float, help= 'Word insertion bonus (lm_beta). If not specified, use default from the scorer package.' ) parser.add_argument('--version', action=VersionAction, help='Print version and exits') parser.add_argument('--extended', required=False, action='store_true', help='Output string from extended metadata') parser.add_argument( '--json', required=False, action='store_true', help='Output json from metadata with timestamp of each word') args = parser.parse_args() print('Loading model from file {}'.format(args.model), file=sys.stderr) model_load_start = timer() # sphinx-doc: python_ref_model_start ds = Model(args.model) # sphinx-doc: python_ref_model_stop model_load_end = timer() - model_load_start print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr) if args.beam_width: ds.setModelBeamWidth(args.beam_width) desired_sample_rate = ds.sampleRate() if args.scorer: print('Loading scorer from files {}'.format(args.scorer), file=sys.stderr) scorer_load_start = timer() ds.enableExternalScorer(args.scorer) scorer_load_end = timer() - scorer_load_start print('Loaded scorer in {:.3}s.'.format(scorer_load_end), file=sys.stderr) if args.lm_alpha and args.lm_beta: ds.setScorerAlphaBeta(args.lm_alpha, args.lm_beta) fin = wave.open(args.audio, 'rb') fs_orig = fin.getframerate() if fs_orig != desired_sample_rate: print( 'Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.' .format(fs_orig, desired_sample_rate), file=sys.stderr) fs_new, audio = convert_samplerate(args.audio, desired_sample_rate) else: audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16) audio_length = fin.getnframes() * (1 / fs_orig) fin.close() print('Running inference.', file=sys.stderr) inference_start = timer() # sphinx-doc: python_ref_inference_start if args.extended: print(metadata_to_string(ds.sttWithMetadata(audio, 1).transcripts[0])) elif args.json: print(metadata_json_output(ds.sttWithMetadata(audio, 3))) else: print(ds.stt(audio)) # sphinx-doc: python_ref_inference_stop inference_end = timer() - inference_start print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
def __init__(self):
    self.model = Model(model_path=Path(__file__).parents[1].joinpath('model.pbmm').absolute().as_posix())
class AugmentedSpeech: """ """ def __init__(self, runVerbose=False): self.ds_model = None self.osc_client = None self.verbose = runVerbose pass # setting up OSC subsystem def init_osc(self, host, port): self.osc_client = SimpleUDPClient(host, port) pass # Setting up deepspeech def init_deepspeech(self): self.ds_model = Model(ds_model_path, ds_features['beam_width']) self.ds_model.enableDecoderWithLM(ds_lm_path, ds_trie_path, ds_features['lm_alpha'], ds_features['lm_beta']) pass # processes a frame of the ODAS tracker def __process_odas_frame(self, buffer): # get dict of json buffer buffer_dict = json.loads(buffer) # parse src for v in buffer_dict['src']: # filter out inactive sources if v['activity'] < 0.5: continue pay_load = [] pay_load.append(buffer_dict['timeStamp']) pay_load.append(v['id']) pay_load.append(v['x']) pay_load.append(v['y']) pay_load.append(v['z']) pay_load.append(v['activity']) pay_load.append(v['tag']) self.osc_client.send_message('/source', pay_load) def run(self): print('ready ... ') buffer = "" # we pipe everything to the wrapper p = subprocess.Popen(odaslive_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) for line in iter(p.stdout.readline, b''): s = str(line, 'utf-8') # a frame can be identified by a closing curly bracket if s.find('}\n') == 0: buffer += s self.__process_odas_frame(buffer) buffer = "" else: buffer += s p.stdout.close() # pass back return code return p.wait()
class DeepspeechNode: # Two ways to pass in model: # model_path: path the pre-trained model # model: model itself # # dictionary: output from deepspeech is corrected to the closest word # in the given dictionary on a word-by-word basis # commands: complete output from deepspeech is corrected to the # closest phrase in the given commands def __init__(self, model=None, model_path=None, commands=None, dictionary=None): self.model = model if model_path != None: self.load_model(model_path) self.commands = commands self.dictionary = dictionary # Default values for n_feaures, n_context, beam_width are from # github.com/mozilla/DeepSpeech/blob/master/native_client/python/client.py def load_model(self, model_path, n_features=26, n_context=9, beam_width=500): model_path += "/" alphabet = model_path + "alphabet.txt" output_graph = model_path + "output_graph.pb" self.model = Model(output_graph, n_features, n_context, alphabet, beam_width) # See: en.wikipedia.org/wiki/Levenshtein_distance def levenshtein_distance(self, str1, str2): if len(str2) > len(str1): str2, str1 = str1, str2 row_count = len(str1) col_count = len(str2) row_current = range(row_count + 1) for row_n in xrange(row_count): row_next = [row_n + 1] for col_n in xrange(col_count): delete_cost = row_current[col_n + 1] + 1 insert_cost = row_next[col_n] + 1 substitution_cost = row_current[col_n] + 1 if str1[row_n] == str2[col_n]: substitution_cost = row_current[col_n] row_next.append( min([substitution_cost, delete_cost, insert_cost])) row_current = row_next return row_current[-1] def stt(self, fs, audio): assert self.model != None, "a model must be loaded before testing" transcription = self.model.stt(audio, fs) if self.dictionary != None: transcription_words = transcription.split(" ") new_transcription = "" for transcribed_word in transcription_words: distances = [ self.levenshtein_distance(transcribed_word, dict_word) for dict_word in self.dictionary ] min_dist_index = min(xrange(len(distances)), key=lambda x: distances[x]) word_guess = self.dictionary[min_dist_index] new_transcription += word_guess + " " transcription = new_transcription if self.commands != None: distances = [ self.levenshtein_distance(transcription, possibility) for possibility in self.commands ] min_dist_index = min(xrange(len(distances)), key=lambda x: distances[x]) transcription_guess = self.commands[min_dist_index] transcription = transcription_guess return transcription
def app_sst(model_path: str, lm_path: str, lm_alpha: float, lm_beta: float, beam: int): webrtc_ctx = webrtc_streamer( key="speech-to-text", mode=WebRtcMode.SENDONLY, audio_receiver_size=1024, client_settings=ClientSettings( rtc_configuration={ "iceServers": [{ "urls": ["stun:stun.l.google.com:19302"] }] }, media_stream_constraints={ "video": False, "audio": True }, ), ) status_indicator = st.empty() if not webrtc_ctx.state.playing: return status_indicator.write("Loading...") text_output = st.empty() stream = None while True: if webrtc_ctx.audio_receiver: if stream is None: from deepspeech import Model model = Model(model_path) model.enableExternalScorer(lm_path) model.setScorerAlphaBeta(lm_alpha, lm_beta) model.setBeamWidth(beam) stream = model.createStream() status_indicator.write("Model loaded.") sound_chunk = pydub.AudioSegment.empty() try: audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=1) except queue.Empty: time.sleep(0.1) status_indicator.write("No frame arrived.") continue status_indicator.write("Running. Say something!") for audio_frame in audio_frames: sound = pydub.AudioSegment( data=audio_frame.to_ndarray().tobytes(), sample_width=audio_frame.format.bytes, frame_rate=audio_frame.sample_rate, channels=len(audio_frame.layout.channels), ) sound_chunk += sound if len(sound_chunk) > 0: sound_chunk = sound_chunk.set_channels(1).set_frame_rate( model.sampleRate()) buffer = np.array(sound_chunk.get_array_of_samples()) stream.feedAudioContent(buffer) text = stream.intermediateDecode() text_output.markdown(f"**Text:** {text}") else: status_indicator.write("AudioReciver is not set. Abort.") break
class ModelWrapper(MAXModelWrapper): MODEL_NAME = 'MAX Speech to Text Converter' DEFAULT_MODEL_PATH = 'assets/models/output_graph.pbmm' MODEL_LICENSE = "MPL-2.0" MODEL_META_DATA = { 'id': '{}'.format(MODEL_NAME.lower().replace(' ', '-')), 'name': MODEL_NAME, 'description': 'Converts spoken words into text form.', 'type': 'Speech-To-Text Translation', 'license': MODEL_LICENSE, 'source': 'https://developer.ibm.com/exchanges/models/all/max-speech-to-text-converter/' } N_FEATURES = 26 # number of MFCC features N_CONTEXT = 9 # Size of the context window used for producing timesteps in the input vector BEAM_WIDTH = 500 # Beam width used in the CTC decoder when building candidate transcriptions LM_ALPHA = 0.75 # The alpha hyperparameter of the CTC decoder. Language Model weight LM_BETA = 1.85 # The beta hyperparameter of the CTC decoder. Word insertion bonus. alphabet_path = 'assets/models/alphabet.txt' lm_path = 'assets/models/lm.binary' trie_path = 'assets/models/trie' def __init__(self, path=DEFAULT_MODEL_PATH): logger.info('Loading model from: {}...'.format(path)) self.model = Model(path, self.N_FEATURES, self.N_CONTEXT, self.alphabet_path, self.BEAM_WIDTH) self.model.enableDecoderWithLM(self.alphabet_path, self.lm_path, self.trie_path, self.LM_ALPHA, self.LM_BETA) logger.info('Loaded model') def _convert_samplerate(self, audio_data, fs): resampled_audio = resample(np.frombuffer(audio_data, np.int16).astype(np.float32), fs, 16000) return 16000, resampled_audio.astype(np.int16) def _read_audio(self, audio_data): try: fin = wave.open(io.BytesIO(audio_data)) except (wave.Error, EOFError): raise OSError("Error reading the audio file. Only WAV files are supported.") if fin.getnchannels() != 1: raise OSError("Only mono audio files are supported.") fin_len = fin.getnframes() / fin.getframerate() # num frames / frame rate = length in seconds if fin_len > 10: raise OSError("This model is designed to work with short (about 5 second) audio files only.") return fin def _pre_process(self, audio_data): fin = self._read_audio(audio_data) fs = fin.getframerate() if fs != 16000: print('Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech ' 'recognition.'.format(fs), file=sys.stderr) fs, audio = self._convert_samplerate(audio_data, fs) else: audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16) fin.close() return audio def _post_process(self, preds): return preds def _predict(self, x): preds = self.model.stt(x, 16000) return preds
def app_sst_with_video(model_path: str, lm_path: str, lm_alpha: float, lm_beta: float, beam: int): class AudioProcessor(AudioProcessorBase): frames_lock: threading.Lock frames: deque def __init__(self) -> None: self.frames_lock = threading.Lock() self.frames = deque([]) async def recv_queued(self, frames: List[av.AudioFrame]) -> av.AudioFrame: with self.frames_lock: self.frames.extend(frames) # Return empty frames to be silent. new_frames = [] for frame in frames: input_array = frame.to_ndarray() new_frame = av.AudioFrame.from_ndarray( np.zeros(input_array.shape, dtype=input_array.dtype), layout=frame.layout.name, ) new_frame.sample_rate = frame.sample_rate new_frames.append(new_frame) return new_frames webrtc_ctx = webrtc_streamer( key="speech-to-text-w-video", mode=WebRtcMode.SENDRECV, audio_processor_factory=AudioProcessor, client_settings=ClientSettings( rtc_configuration={ "iceServers": [{ "urls": ["stun:stun.l.google.com:19302"] }] }, media_stream_constraints={ "video": True, "audio": True }, ), ) status_indicator = st.empty() if not webrtc_ctx.state.playing: return status_indicator.write("Loading...") text_output = st.empty() stream = None while True: if webrtc_ctx.audio_processor: if stream is None: from deepspeech import Model model = Model(model_path) model.enableExternalScorer(lm_path) model.setScorerAlphaBeta(lm_alpha, lm_beta) model.setBeamWidth(beam) stream = model.createStream() status_indicator.write("Model loaded.") sound_chunk = pydub.AudioSegment.empty() audio_frames = [] with webrtc_ctx.audio_processor.frames_lock: while len(webrtc_ctx.audio_processor.frames) > 0: frame = webrtc_ctx.audio_processor.frames.popleft() audio_frames.append(frame) if len(audio_frames) == 0: time.sleep(0.1) status_indicator.write("No frame arrived.") continue status_indicator.write("Running. Say something!") for audio_frame in audio_frames: sound = pydub.AudioSegment( data=audio_frame.to_ndarray().tobytes(), sample_width=audio_frame.format.bytes, frame_rate=audio_frame.sample_rate, channels=len(audio_frame.layout.channels), ) sound_chunk += sound if len(sound_chunk) > 0: sound_chunk = sound_chunk.set_channels(1).set_frame_rate( model.sampleRate()) buffer = np.array(sound_chunk.get_array_of_samples()) stream.feedAudioContent(buffer) text = stream.intermediateDecode() text_output.markdown(f"**Text:** {text}") else: status_indicator.write("AudioReciver is not set. Abort.") break
try:
    import unzip_requirements
except ImportError:
    pass

import shlex
import subprocess
import base64
import io
import sys
import wave
import json

import numpy as np
from deepspeech import Model

ds = Model('./model/deepspeech-0.7.1-models.pbmm')
desired_sample_rate = ds.sampleRate()

try:
    from shlex import quote
except ImportError:
    from pipes import quote


def convert_samplerate(audio_path, desired_sample_rate):
    sox_cmd = 'sox {} --type raw --bits 16 --channels 1 --rate {} --encoding signed-integer --endian little --compression 0.0 --no-dither - '.format(quote(audio_path), desired_sample_rate)
    try:
        output = subprocess.check_output(shlex.split(sox_cmd), stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
    except OSError as e:
def main(): parser = argparse.ArgumentParser( description='Running DeepSpeech inference.') parser.add_argument('--model', required=True, help='Path to the model (protocol buffer binary file)') parser.add_argument( '--alphabet', required=True, help= 'Path to the configuration file specifying the alphabet used by the network' ) parser.add_argument('--lm', nargs='?', help='Path to the language model binary file') parser.add_argument( '--trie', nargs='?', help= 'Path to the language model trie file created with native_client/generate_trie' ) parser.add_argument('--audio', required=True, help='Path to the audio file to run (WAV format)') parser.add_argument('--beam_width', type=int, default=500, help='Beam width for the CTC decoder') parser.add_argument('--lm_alpha', type=float, default=0.75, help='Language model weight (lm_alpha)') parser.add_argument('--lm_beta', type=float, default=1.85, help='Word insertion bonus (lm_beta)') parser.add_argument('--version', action=VersionAction, help='Print version and exits') parser.add_argument('--extended', required=False, action='store_true', help='Output string from extended metadata') args = parser.parse_args() print('Loading model from file {}'.format(args.model), file=sys.stderr) model_load_start = timer() ds = Model(args.model, args.alphabet, args.beam_width) model_load_end = timer() - model_load_start print('Loaded model in {:.3}s.'.format(model_load_end), file=sys.stderr) desired_sample_rate = ds.sampleRate() if args.lm and args.trie: print('Loading language model from files {} {}'.format( args.lm, args.trie), file=sys.stderr) lm_load_start = timer() ds.enableDecoderWithLM(args.lm, args.trie, args.lm_alpha, args.lm_beta) lm_load_end = timer() - lm_load_start print('Loaded language model in {:.3}s.'.format(lm_load_end), file=sys.stderr) fin = wave.open(args.audio, 'rb') fs = fin.getframerate() if fs != desired_sample_rate: print( 'Warning: original sample rate ({}) is different than {}hz. Resampling might produce erratic speech recognition.' .format(fs, desired_sample_rate), file=sys.stderr) fs, audio = convert_samplerate(args.audio, desired_sample_rate) else: audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16) audio_length = fin.getnframes() * (1 / fs) fin.close() print('Running inference.', file=sys.stderr) inference_start = timer() if args.extended: print(metadata_to_string(ds.sttWithMetadata(audio))) else: print(ds.stt(audio)) inference_end = timer() - inference_start print('Inference took %0.3fs for %0.3fs audio file.' % (inference_end, audio_length), file=sys.stderr)
    return 16000, np.frombuffer(output, np.int16)


LM_WEIGHT = 1.5
VALID_WORD_COUNT_WEIGHT = 2.25
N_FEATURES = 26
N_CONTEXT = 9
BEAM_WIDTH = 512

model = '/home/nihadern/voice_recog/models/output_graph.rounded.pbmm'
micro = '/home/nihadern/voice_recog/male.wav'
alpha = '/home/nihadern/voice_recog/models/alphabet.txt'
trie = '/home/nihadern/voice_recog/models/trie'
lm = '/home/nihadern/voice_recog/models/lm.binary'

ds = Model(model, N_FEATURES, N_CONTEXT, alpha, BEAM_WIDTH)  # model link, cepstrum, context
ds.enableDecoderWithLM(alpha, lm, trie, LM_WEIGHT, VALID_WORD_COUNT_WEIGHT)
print('\nModel ok')

print('\nreading voice')
fin = wave.open(micro, 'rb')
fs = fin.getframerate()
if fs != 16000:
    print(
        'Warning: original sample rate ({}) is different than 16kHz. Resampling might produce erratic speech recognition.'
        .format(fs),
        file=sys.stderr)
    fs, audio = convert_samplerate(micro)
else:
    audio = np.frombuffer(fin.readframes(fin.getnframes()), np.int16)
dirname2 = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

import garbagecollector as gc

sample_rate = 16000
beam_width = 500
lm_alpha = 0.75
lm_beta = 1.85
n_features = 26
n_context = 9

model_name = dirname2 + "/models/output_graph.pbmm"
alphabet = dirname2 + "/models/alphabet.txt"
langauage_model = dirname2 + "/models/lm.binary"
trie = dirname2 + "/models/trie"

ds = Model(model_name, beam_width)
try:
    ds.enableDecoderWithLM(langauage_model, trie, lm_alpha, lm_beta)
except:
    print("No language model and trie specified")


def downsampleWav(src, dst, inrate=44100, outrate=sample_rate, inchannels=2, outchannels=1):
    if not os.path.exists(src):
        return ('Source not found!', False)
    if not os.path.exists(os.path.dirname(dst)):
        os.makedirs(os.path.dirname(dst))

    try:
        s_read = wave.open(src, 'r')
        s_write = wave.open(dst, 'w')
from deepspeech import Model

for i in range(5):
    ds = Model('/models/mozilla/deepspeech-0.7.3-models.pbmm')
    ds.enableExternalScorer('/models/mozilla/deepspeech-0.7.3-models.scorer')
    ds.setScorerAlphaBeta(0.75, 1.85)
    ds.__del__()
from flask import Flask
from flask import jsonify
from flask import request
from flask_cors import CORS, cross_origin

BEAM_WIDTH = 1024
LM_WEIGHT = 0.75
VALID_WORD_COUNT_WEIGHT = 1.85
N_FEATURES = 26
N_CONTEXT = 9
MODEL_FILE = 'models/output_graph.pbmm'
ALPHABET_FILE = 'models/alphabet.txt'
LANGUAGE_MODEL = 'models/lm.binary'
TRIE_FILE = 'models/trie'

ds = Model(MODEL_FILE, N_FEATURES, N_CONTEXT, ALPHABET_FILE, BEAM_WIDTH)
ds.enableDecoderWithLM(ALPHABET_FILE, LANGUAGE_MODEL, TRIE_FILE, LM_WEIGHT, VALID_WORD_COUNT_WEIGHT)

app = Flask(__name__)
cors = CORS(app)
app.config['CORS_HEADERS'] = 'Content-Type'


@app.route('/', methods=['POST'])
@cross_origin()
def post():
    fileName = 'file_' + str(uuid.uuid4()) + '.wav'
    with open(fileName, "wb") as vid:
        vid.write(request.data)
    fs, audio = scipy.io.wavfile.read(fileName)
    processed_data = ds.stt(audio, fs)
    try:
        output = subprocess.check_output(shlex.split(sox_cmd), stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        raise RuntimeError('SoX returned non-zero status: {}'.format(e.stderr))
    except OSError as e:
        raise OSError(e.errno, 'SoX not found, use {}hz files or install it: {}'.format(desired_sample_rate, e.strerror))

    return desired_sample_rate, np.frombuffer(output, np.int16)


print('Loading model from file {}'.format(model))
model_load_start = timer()
# sphinx-doc: python_ref_model_start
ds = Model(model)
# sphinx-doc: python_ref_model_stop
model_load_end = timer() - model_load_start
print('Loaded model in {:.3}s.'.format(model_load_end))

if beam_width:
    ds.setBeamWidth(beam_width)

desired_sample_rate = ds.sampleRate()

if scorer:
    print('Loading scorer from files {}'.format(scorer))
    scorer_load_start = timer()
    ds.enableExternalScorer(scorer)
    scorer_load_end = timer() - scorer_load_start
    print('Loaded scorer in {:.3}s.'.format(scorer_load_end))
# Number of MFCC features to use
N_FEATURES = 26
# Size of the context window used for producing timesteps in the input vector
N_CONTEXT = 9
# How aggressive to be when splitting audio files into chunks
aggressiveness = 1

UPLOAD_FOLDER = '/tmp'
ALLOWED_EXTENSIONS = set(['wav', 'mp3', 'flac'])

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER

ds = Model('models/output_graph.pbmm', N_FEATURES, N_CONTEXT, 'models/alphabet.txt', BEAM_WIDTH)
ds.enableDecoderWithLM('models/alphabet.txt', 'models/lm.binary', 'models/trie', LM_WEIGHT, VALID_WORD_COUNT_WEIGHT)

api_keys = []
api_keyfile = 'api_keys.txt'
transcription_in_progress = False
print(transcription_in_progress)


def load_keys(keylist):
    with open(keylist) as f:
        for line in f:
            credential = line.split(', ')
            api_keys.append(credential[0])
def main(): global line_count print("AutoSub v0.1\n") parser = argparse.ArgumentParser(description="AutoSub v0.1") parser.add_argument('--model', required=True, help='DeepSpeech model file') parser.add_argument('--scorer', help='DeepSpeech scorer file') parser.add_argument('--file', required=True, help='Input video file') args = parser.parse_args() ds_model = args.model if not ds_model.endswith(".pbmm"): print("Invalid model file. Exiting\n") exit(1) # Load DeepSpeech model ds = Model(ds_model) if args.scorer: ds_scorer = args.scorer if not ds_scorer.endswith(".scorer"): print( "Invalid scorer file. Running inference using only model file\n" ) else: ds.enableExternalScorer(ds_scorer) input_file = args.file print("\nInput file:", input_file) base_directory = os.getcwd() output_directory = os.path.join(base_directory, "output") audio_directory = os.path.join(base_directory, "audio") video_file_name = input_file.split("/")[-1].split(".")[0] audio_file_name = os.path.join(audio_directory, video_file_name + ".wav") srt_file_name = os.path.join(output_directory, video_file_name + ".srt") # Extract audio from input video file extract_audio(input_file, audio_file_name) print("Splitting on silent parts in audio file") silenceRemoval(audio_file_name) # Output SRT file file_handle = open(srt_file_name, "a+") print("\nRunning inference:") for file in tqdm(sort_alphanumeric(os.listdir(audio_directory))): audio_segment_path = os.path.join(audio_directory, file) # Dont run inference on the original audio file if audio_segment_path.split("/")[-1] != audio_file_name.split("/")[-1]: ds_process_audio(ds, audio_segment_path, file_handle) print("\nSRT file saved to", srt_file_name) file_handle.close() # Clean audio/ directory shutil.rmtree(audio_directory) os.mkdir(audio_directory)
class DetectAndDeter: CLASSIFICATION_COUNT = 5 TELEMARKETER_THRESH = 0.3 VALID_CALLER_THRESH = 0.1 IN_AUDIO_RATE = 8000 DS_AUDIO_RATE = 16000 MOZILLA_TTS_AUDIO_RATE = 22050 QUIET_THRESH = 150 QUIET_LENGTH = 3000 def __init__(self, name): self.name = name # user's name e.g. "Bob Ross" self.valid_caller_event = Event() self.caller_audio_chunk = np.array([], dtype='int16') self.audio_in_queue = Queue() self.stt_to_classification_queue = Queue() self.stt_to_chatbot_queue = Queue() self.chatbot_to_tts_queue = Queue() self.audio_out_queue = Queue() self.manager = Manager() self.transcript = self.manager.list() self.is_telemarketer = self.manager.Value("is_telemarketer", None) self.deep_speech = None self.mozilla_tts = None self.final_transcript = None self.final_predictions = None self.speech_to_text_thread = Process(target=self.speech_to_text) self.classify_text_thread = Process(target=self.classify_text) self.generate_response_thread = Process(target=self.generate_responses) self.text_to_speech_thread = Process(target=self.text_to_speech) self.log = { "start": None, "end": None, "version": CONFIG['version'], "transcript": [], "is_telemarketer": None, "caller": None } @property def queues(self): return self.audio_in_queue, self.audio_out_queue def start(self): self.speech_to_text_thread.start() self.classify_text_thread.start() self.generate_response_thread.start() self.text_to_speech_thread.start() self.log["start"] = dt.datetime.now().isoformat() def close(self): self.log["transcript"] = [value for value in self.transcript] self.log["is_telemarketer"] = self.is_telemarketer.value self.log["end"] = dt.datetime.now().isoformat() self.speech_to_text_thread.terminate() self.speech_to_text_thread.join() self.speech_to_text_thread.close() self.classify_text_thread.terminate() self.classify_text_thread.join() self.classify_text_thread.close() self.generate_response_thread.terminate() self.generate_response_thread.join() self.generate_response_thread.close() self.text_to_speech_thread.terminate() self.text_to_speech_thread.join() self.text_to_speech_thread.close() def fill_log_info(self, caller_number): self.log['caller'] = caller_number return self.log def classify_text(self): predictions = [] while self.is_telemarketer.value is None: idx = self.stt_to_classification_queue.get() text = self.transcript[idx]['text'] preds = model.predict(text) transcript_line = self.transcript[idx] transcript_line['analysis'] = { "prediction": str(preds[0]).lower(), "confidence": float(max(preds[2])) } self.transcript[idx] = transcript_line predictions.append(str(preds[0]).lower()) maybe_telemarketer = predictions.count("persuasion") / len( predictions) if len(predictions) > self.CLASSIFICATION_COUNT: print("CLASS") print(maybe_telemarketer, self.TELEMARKETER_THRESH, self.VALID_CALLER_THRESH) if maybe_telemarketer > self.TELEMARKETER_THRESH: self.is_telemarketer.value = True break elif maybe_telemarketer < self.VALID_CALLER_THRESH: self.is_telemarketer.value = False # self.is_telemarketer.set() break if not self.is_telemarketer.value: self.valid_caller_event.set() def generate_responses(self): while True: text = self.stt_to_chatbot_queue.get() print("Generate Response:", text) response = str(chatbot.get_response(text)) self.chatbot_to_tts_queue.put(response) def text_to_speech(self): tts_config = CONFIG['tts_config'] models_folder = Path(tts_config['folder']) model_path = str(models_folder / tts_config['model']) model_config_path = str(models_folder / tts_config['model_config']) vocoder_path = str(models_folder / 
tts_config['vocoder']) vocoder_config_path = str(models_folder / tts_config['vocoder_config']) self.mozilla_tts = Synthesizer(model_path, model_config_path, vocoder_path, vocoder_config_path) while True: response = self.chatbot_to_tts_queue.get() print("TTS:", response) sound_arr = np.array(self.mozilla_tts.tts(response)) sound_arr *= 2**15 sound_arr = sound_arr.astype('int16') sound = bytes(sound_arr) sound, _ = audioop.ratecv(sound, 2, 1, self.MOZILLA_TTS_AUDIO_RATE, self.IN_AUDIO_RATE, None) ulaw_sound = audioop.lin2ulaw(sound, 2) chunk_len = 540 chunks = len(ulaw_sound) // chunk_len extra = len(ulaw_sound) - (chunks * chunk_len) for c in range(chunks): chunk = ulaw_sound[c * chunk_len:c * chunk_len + chunk_len] self.audio_out_queue.put( base64.b64encode(chunk).decode('utf-8')) if extra != 0: chunk = ulaw_sound[-extra:] self.audio_out_queue.put( base64.b64encode(chunk).decode('utf-8')) self.transcript.append({ "speaker": "self", "text": response, "datetime": dt.datetime.now().isoformat() }) def speech_to_text(self): stt_config = CONFIG['stt_config'] models_folder = Path(stt_config['folder']) model_path = str(models_folder / stt_config['model']) scorer_path = str(models_folder / stt_config['scorer']) self.deep_speech = Model(model_path) self.deep_speech.enableExternalScorer(scorer_path) stream = self.deep_speech.createStream() while True: speech = self.audio_in_queue.get() while not self.audio_in_queue.empty(): speech += self.audio_in_queue.get() lin_speech = audioop.ulaw2lin(speech, 2) ds_speech, _ = audioop.ratecv(lin_speech, 2, 1, self.IN_AUDIO_RATE, self.DS_AUDIO_RATE, None) lin_speech_arr = np.frombuffer(lin_speech, np.int16) ds_speech_arr = np.frombuffer(ds_speech, np.int16) stream.feedAudioContent(ds_speech_arr) self.caller_audio_chunk = np.concatenate( (self.caller_audio_chunk, lin_speech_arr)) chunk_idx = max(0, len(self.caller_audio_chunk) - self.QUIET_LENGTH) quiet_chunk = self.caller_audio_chunk[chunk_idx:] if (quiet_chunk < self.QUIET_THRESH).all() and ( self.caller_audio_chunk > self.QUIET_THRESH).any(): text = stream.intermediateDecode() if text.strip(): self.stt_to_chatbot_queue.put(text) idx = len(self.transcript ) # insert to avoid race conditions with indexes self.transcript.insert( idx, { "speaker": "caller", "text": text, "datetime": dt.datetime.now().isoformat() }) self.stt_to_classification_queue.put(idx) stream.finishStream() stream = self.deep_speech.createStream() self.caller_audio_chunk = np.array([], dtype='int16') def make_greeting(self, one_party_consent): self.chatbot_to_tts_queue.put( f"Hi. This is {self.name} how may I help you?") if not one_party_consent: self.chatbot_to_tts_queue.put("Keep in mind, I record all calls")
parser = argparse.ArgumentParser(description=None)
parser.add_argument('--input', type=str, dest="input", required=True,
                    help="Input audio .wav file at 16KHz")
args = parser.parse_args()
while len(sys.argv) > 1:
    sys.argv.pop()

BEAM_WIDTH = 500
LM_ALPHA = 0.75
LM_BETA = 1.85
N_FEATURES = 26
N_CONTEXT = 9
MODEL_PATH = 'deepspeech-0.4.1-checkpoint/models/output_graph.pb'
ALPHABET_PATH = 'deepspeech-0.4.1-checkpoint/models/alphabet.txt'
LM_PATH = 'deepspeech-0.4.1-checkpoint/models/lm.binary'
TRIE_PATH = 'deepspeech-0.4.1-checkpoint/models/trie'

ds = Model(MODEL_PATH, N_FEATURES, N_CONTEXT, ALPHABET_PATH, BEAM_WIDTH)
ds.enableDecoderWithLM(ALPHABET_PATH, LM_PATH, TRIE_PATH, LM_ALPHA, LM_BETA)

# Audio path
AUDIO_PATH = args.input

# Read soundfile; should say "experience"
audio_data, sample_rate = soundfile.read(AUDIO_PATH, dtype='int16')
print(audio_data)
print(ds.stt(audio_data, sample_rate))
lastDetection = 0
startRecording = 0
endRecording = 0
newAudioDetected = False

audio = pyaudio.PyAudio()  # instantiate the pyaudio

# recording prerequisites
stream = audio.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True,
                    frames_per_buffer=CHUNK)

# Load the model
ds = Model(sys.argv[1], 500)

# Start recording
frames = []

# Start detection
newAudioDetected = False
lastDetection = current_milli_time()

while True:
    # Capture the audio
    data = stream.read(CHUNK)
    data_chunk = array('h', data)
    vol = max(data_chunk)

    # Volume detection:
    if (vol >= 800):
def transcriber(inputdir, md, lm_alpha, lm_beta): toolbar_width = 40 inputdir_len = len(fnmatch.filter(os.listdir(path + inputdir), '*.wav')) print(inputdir_len) if (inputdir_len > toolbar_width): incriment = math.trunc(inputdir_len / toolbar_width) else: toolbar_width = inputdir_len incriment = 1 count = 1 outputList = [] BEAM_WIDTH = 500 deep = Model(md + '/output_graph.pbmm', BEAM_WIDTH) enabled = deep.enableDecoderWithLM(md + '/lm.binary', md + '/trie', lm_alpha, lm_beta) print('Decoder Enabled <0=true>:', enabled) # setup progress bar sys.stdout.write("[%s]" % (" " * toolbar_width)) sys.stdout.flush() sys.stdout.write("\b" * (toolbar_width + 1)) # return to start of line, after '[' filelist = sorted(os.listdir(path + inputdir)) # writes filelist to an output file with open( os.path.join(path + inputdir, inputdir.rsplit('/', 1)[1] + '_filelist.txt'), 'w') as writer: writer.writelines('\n'.join(filelist) + '\n') for filename in filelist: with open(os.path.join(path + inputdir, filename), 'r') as f: # open in readonly mode # if the file is not a wav file skip if filename.rsplit('.', 1)[1] != "wav": continue # Check if filename end in _mono.wav, if yes, skip the file if filename.rsplit('_', 1)[1] == "mono.wav": continue # Check if the file has an associated _mono.wav file in the directory if filename.rsplit( '.', 1)[0] + "_mono.wav" in os.listdir(path + inputdir): #if yes, prepare the _mono.wav file fs, audio = sound_utils.prepare_input( path + inputdir + "/" + filename.rsplit('.', 1)[0] + "_mono.wav") else: #if no, create an _mono.wav file and prepare that file sound_utils.stereo_to_mono(path + inputdir + "/" + filename) fs, audio = sound_utils.prepare_input( path + inputdir + "/" + filename.rsplit('.', 1)[0] + "_mono.wav") # run prepared audio through DeepSpeech result = deep.stt(audio) # remove generated processed file os.remove(path + inputdir + "/" + filename.rsplit('.', 1)[0] + "_mono.wav") # add the result to the outputList outputList.append(result + "\n") # progress bar incriment if (count == incriment): sys.stdout.write("-") sys.stdout.flush() count = 0 count += 1 sys.stdout.write("]\n") # this ends the progress bar # writes results to an output file with open( os.path.join(path + inputdir, inputdir.rsplit('/', 1)[1] + '_output.txt'), 'w') as writer: writer.writelines(outputList) return