def main():
    """Configure and launch the Flask-based DeepSpeech transcription server.

    Parses CLI options, loads the acoustic model and a CTC decoder into
    module-level globals (consumed by the request handlers defined elsewhere
    in this file), then starts the HTTP server.
    """
    import argparse
    global model, spect_parser, decoder, args

    ap = argparse.ArgumentParser(description='DeepSpeech transcription server')
    ap.add_argument('--host', type=str, default='0.0.0.0', help='Host to be used by the server')
    ap.add_argument('--port', type=int, default=8888, help='Port to be used by the server')
    ap = add_inference_args(ap)
    ap = add_decoder_args(ap)
    args = ap.parse_args()

    logging.getLogger().setLevel(logging.DEBUG)
    logging.info('Setting up server...')

    # Inference only -- disable autograd for the whole process.
    torch.set_grad_enabled(False)
    model = DeepSpeech.load_model(args.model_path)
    if args.cuda:
        model.cuda()
    model.eval()

    label_set = DeepSpeech.get_labels(model)
    conf = DeepSpeech.get_audio_conf(model)
    if args.decoder == "beam":
        # ctcdecode is an optional dependency; import only when requested.
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(label_set, lm_path=args.lm_path, alpha=args.alpha,
                                 beta=args.beta, cutoff_top_n=args.cutoff_top_n,
                                 cutoff_prob=args.cutoff_prob, beam_width=args.beam_width,
                                 num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(label_set, blank_index=label_set.index('_'))

    spect_parser = SpectrogramParser(conf, normalize=True)
    logging.info('Server initialised')
    app.run(host=args.host, port=args.port, debug=True, use_reloader=False)
def decode(self, probs, sizes=None, rescore=False):
    """
    Decodes probability output using ctcdecode package.

    Arguments:
        probs: Tensor of character probabilities, where probs[c,t]
            is the probability of character c at time t
        sizes: Size of each sequence in the mini-batch
        rescore: If True, rescore the decoded sentences with an external
            language model (path taken from the ``--rescore-lm`` CLI option)
    Returns:
        tuple: (decoded strings, per-character time offsets)
    """
    probs = probs.cpu()
    out, scores, offsets, seq_lens = self._decoder.decode(probs, sizes)
    strings = self.convert_to_strings(out, seq_lens)
    offsets = self.convert_tensor(offsets, seq_lens)
    if rescore:
        # Late imports: rescoring (and its LM) is only needed on this path.
        import argparse
        from rescoring import rescore_sent

        # NOTE(review): re-parsing the process argv inside decode() is
        # fragile -- the LM path should really be passed in by the caller.
        # The original code built a full transcription CLI here (audio
        # path, batch size, workers, ...) and then read ``args.rescore_lm``,
        # an option it never defined, which raised AttributeError.  Keep
        # only the decoder options, define --rescore-lm explicitly, and use
        # parse_known_args() so unrelated argv entries of the host program
        # do not abort decoding.
        parser = argparse.ArgumentParser(description="DeepSpeech decoder")
        parser = add_decoder_args(parser)
        parser.add_argument("--rescore-lm", default=None, type=str,
                            help="Language model used to rescore decoded sentences")
        args, _ = parser.parse_known_args()
        strings = rescore_sent(strings, args.rescore_lm)
    return strings, offsets
def main():
    """Configure and launch the websocket-based DeepSpeech transcription server.

    Loads the model, decoder, spectrogram parser and an LZString
    decompressor into module globals (used by the websocket callbacks
    defined elsewhere in this file), then serves forever.
    """
    import argparse
    global model, spect_parser, decoder, args, device, decompressor

    cli = argparse.ArgumentParser(description='DeepSpeech transcription server')
    cli.add_argument('--host', type=str, default='0.0.0.0', help='Host to be used by the server')
    cli.add_argument('--port', type=int, default=8888, help='Port to be used by the server')
    cli = add_inference_args(cli)
    cli = add_decoder_args(cli)
    args = cli.parse_args()

    logging.getLogger().setLevel(logging.DEBUG)
    logging.info('Setting up server...')

    # Pure inference server -- autograd is never needed.
    torch.set_grad_enabled(False)
    device = torch.device("cuda" if args.cuda else "cpu")
    model = load_model(device, args.model_path, args.half)

    if args.decoder == "beam":
        # Optional ctcdecode-backed beam decoder; imported lazily.
        from decoder import BeamCTCDecoder
        decoder = BeamCTCDecoder(model.labels, lm_path=args.lm_path,
                                 alpha=args.alpha, beta=args.beta,
                                 cutoff_top_n=args.cutoff_top_n,
                                 cutoff_prob=args.cutoff_prob,
                                 beam_width=args.beam_width,
                                 num_processes=args.lm_workers)
    else:
        decoder = GreedyDecoder(model.labels,
                                blank_index=model.labels.index('_'))

    spect_parser = OnlineSpectrogramParser(model.audio_conf, normalize=True)
    logging.info('Server initialised')

    decompressor = LZString()
    server = WebsocketServer(host=args.host, port=args.port)
    server.set_fn_new_client(new_client)
    server.set_fn_client_left(client_left)
    server.set_fn_message_received(message_received)
    server.run_forever()
def main():
    """Configure and launch the Flask-based DeepSpeech transcription server.

    Parses CLI options, loads the acoustic model and a CTC decoder into
    module-level globals (consumed by the request handlers defined elsewhere
    in this file), then starts the HTTP server.
    """
    import argparse
    global model, spect_parser, decoder, args, device
    parser = argparse.ArgumentParser(
        description="DeepSpeech transcription server")
    parser.add_argument("--host", type=str,
                        default="0.0.0.0", help="Host to be used by the server")
    parser.add_argument("--port", type=int,
                        default=8888, help="Port to be used by the server")
    parser = add_inference_args(parser)
    parser = add_decoder_args(parser)
    args = parser.parse_args()
    logging.getLogger().setLevel(logging.DEBUG)
    logging.info("Setting up server...")
    # Inference only -- disable autograd for the whole process.
    torch.set_grad_enabled(False)
    device = torch.device("cuda" if args.cuda else "cpu")
    # BUG FIX: the third load_model() argument is the half-precision flag,
    # not the CUDA flag (the sibling servers in this codebase pass
    # args.half).  Passing args.cuda forced fp16 on every CUDA run.
    model = load_model(device, args.model_path, args.half)
    if args.decoder == "beam":
        # ctcdecode is an optional dependency; import only when requested.
        from decoder import BeamCTCDecoder

        decoder = BeamCTCDecoder(
            model.labels,
            lm_path=args.lm_path,
            alpha=args.alpha,
            beta=args.beta,
            cutoff_top_n=args.cutoff_top_n,
            cutoff_prob=args.cutoff_prob,
            beam_width=args.beam_width,
            num_processes=args.lm_workers,
        )
    else:
        decoder = GreedyDecoder(model.labels, blank_index=model.labels.index("_"))
    spect_parser = SpectrogramParser(model.audio_conf, normalize=True)
    logging.info("Server initialised")
    # NOTE(review): debug=True enables the Flask debugger on an externally
    # reachable host -- confirm this server is never exposed untrusted.
    app.run(host=args.host, port=args.port, debug=True, use_reloader=False)
help='Batch size for training') parser.add_argument('--num-workers', default=4, type=int, help='Number of workers used in dataloading') parser.add_argument('--verbose', action="store_true", help="print out decoded output and error of each sample") no_decoder_args = parser.add_argument_group( "No Decoder Options", "Configuration options for when no decoder is " "specified") no_decoder_args.add_argument('--output-path', default=None, type=str, help="Where to save raw acoustic output") parser = add_decoder_args(parser) args = parser.parse_args() if __name__ == '__main__': torch.set_grad_enabled(False) model = DeepSpeech.load_model(args.model_path) device = torch.device("cuda" if args.cuda else "cpu") model = model.to(device) model.eval() labels = DeepSpeech.get_labels(model) audio_conf = DeepSpeech.get_audio_conf(model) if args.decoder == "beam": from decoder import BeamCTCDecoder
from decoder import GreedyDecoder import torch from data.data_loader import SpectrogramParser from model import DeepSpeech import os.path import json parser = argparse.ArgumentParser(description='DeepSpeech transcription') parser = add_inference_args(parser) parser.add_argument('--audio-path', default='audio.wav', help='Audio file to predict on') parser.add_argument('--offsets', dest='offsets', action='store_true', help='Returns time offset information') parser = add_decoder_args(parser) args = parser.parse_args() def decode_results(model, decoded_output, decoded_offsets): results = { "output": [], "_meta": { "acoustic_model": { "name": os.path.basename(args.model_path) }, "language_model": { "name": os.path.basename(args.lm_path) if args.lm_path else None, }, "decoder": { "lm": args.lm_path is not None,
decoded_output, decoded_offsets = decoder.decode(out, output_sizes) return decoded_output, decoded_offsets if __name__ == '__main__': arg_parser = argparse.ArgumentParser( description='DeepSpeech transcription') arg_parser = add_inference_args(arg_parser) arg_parser.add_argument('--audio-path', default='audio.wav', help='Audio file to predict on') arg_parser.add_argument('--offsets', dest='offsets', action='store_true', help='Returns time offset information') arg_parser = add_decoder_args(arg_parser) args = arg_parser.parse_args() device = torch.device("cuda" if args.cuda else "cpu") model = load_model(device, args.model_path, args.half) if args.decoder == "beam": from decoder import BeamCTCDecoder decoder = BeamCTCDecoder(model.labels, lm_path=args.lm_path, alpha=args.alpha, beta=args.beta, cutoff_top_n=args.cutoff_top_n, cutoff_prob=args.cutoff_prob, beam_width=args.beam_width, num_processes=args.lm_workers)