def test_SequentialRNNLM_beam_search(rnn_type, tie_weights, dtype):
    """Smoke-test BeamSearch with a SequentialRNNLM as its only scorer.

    Nothing is asserted on the output; the test passes when one full search
    over a random (10, 20) input completes without raising.
    """
    tokens = ["<blank>", "a", "b", "c", "unk", "<eos>"]
    n_vocab = len(tokens)
    # The last vocabulary entry is used as both the sos and the eos id.
    boundary_id = n_vocab - 1

    lm = SequentialRNNLM(
        n_vocab,
        nlayers=2,
        rnn_type=rnn_type,
        tie_weights=tie_weights,
    )
    search = BeamSearch(
        beam_size=3,
        vocab_size=n_vocab,
        weights={"test": 1.0},
        scorers={"test": lm},
        token_list=tokens,
        sos=boundary_id,
        eos=boundary_id,
        pre_beam_score_key=None,
    )
    search.to(dtype=dtype)

    feats = torch.randn(10, 20).type(dtype)
    with torch.no_grad():
        search(x=feats, maxlenratio=0.0, minlenratio=0.0)
def test_TransformerDecoder_beam_search(
    input_layer, normalize_before, use_output_layer, dtype, decoder_class
):
    """Smoke-test BeamSearch with ``decoder_class`` as its only scorer.

    Nothing is asserted on the output; the test passes when one full search
    over a random encoder output completes without raising.
    """
    tokens = ["<blank>", "a", "b", "c", "unk", "<eos>"]
    n_vocab = len(tokens)
    enc_dim = 4
    # The last vocabulary entry is used as both the sos and the eos id.
    boundary_id = n_vocab - 1

    scorer = decoder_class(
        vocab_size=n_vocab,
        encoder_output_size=enc_dim,
        input_layer=input_layer,
        normalize_before=normalize_before,
        use_output_layer=use_output_layer,
        linear_units=10,
    )
    search = BeamSearch(
        beam_size=3,
        vocab_size=n_vocab,
        weights={"test": 1.0},
        scorers={"test": scorer},
        token_list=tokens,
        sos=boundary_id,
        eos=boundary_id,
        pre_beam_score_key=None,
    )
    search.to(dtype=dtype)

    # Random stand-in for an encoder output with 10 frames.
    feats = torch.randn(10, enc_dim).type(dtype)
    with torch.no_grad():
        search(x=feats, maxlenratio=0.0, minlenratio=0.0)
def test_RNNDecoder_beam_search(context_residual, rnn_type, dtype):
    """Smoke-test BeamSearch with an RNNDecoder as its only scorer.

    Nothing is asserted on the output; the test passes when one full search
    over a random encoder output completes without raising.
    """
    tokens = ["<blank>", "a", "b", "c", "unk", "<eos>"]
    n_vocab = len(tokens)
    enc_dim = 4
    # The last vocabulary entry is used as both the sos and the eos id.
    boundary_id = n_vocab - 1

    scorer = RNNDecoder(
        n_vocab,
        encoder_output_size=enc_dim,
        context_residual=context_residual,
        rnn_type=rnn_type,
    )
    search = BeamSearch(
        beam_size=3,
        vocab_size=n_vocab,
        weights={"test": 1.0},
        scorers={"test": scorer},
        token_list=tokens,
        sos=boundary_id,
        eos=boundary_id,
        pre_beam_score_key=None,
    )
    search.to(dtype=dtype)

    # Random stand-in for an encoder output with 10 frames.
    feats = torch.randn(10, enc_dim).type(dtype)
    with torch.no_grad():
        search(x=feats, maxlenratio=0.0, minlenratio=0.0)
def build_beam_search(self, ctc_weight: float = 0.4, beam_size: int = 1):
    """Build the beam-search decoding object and store it in ``self.beam_search``.

    The object decodes the embedding sequence produced by the encoder by
    passing it through the network's decoders: the CTC module and the
    Transformer or RNN decoder. The two are combined as in training:

        Loss = (1 - λ) * DecoderLoss + λ * CTCLoss

    With ``ctc_weight=1`` only the CTC module contributes to decoding.

    Args:
        ctc_weight (float, optional): Weight given to the CTC module.
            Defaults to 0.4.
        beam_size (int, optional): Width of the search beam used while
            decoding. Defaults to 1.
    """
    vocabulary = self.model.token_list
    float_dtype = getattr(torch, 'float32')

    scoring_modules = {
        "decoder": self.model.decoder,
        "ctc": CTCPrefixScorer(ctc=self.model.ctc, eos=self.model.eos),
        "length_bonus": LengthBonus(len(vocabulary)),
    }

    # Weight of each part of the decoding. "lm" refers to language models,
    # which are not used here; the original author notes the key is still
    # required by the object.
    scoring_weights = {
        "decoder": 1.0 - ctc_weight,
        "ctc": ctc_weight,
        "lm": 1.0,
        "length_bonus": 0.0,
    }

    # No pre-beam scoring key is used when decoding relies on CTC alone.
    if ctc_weight == 1.0:
        pre_beam_key = None
    else:
        pre_beam_key = "full"

    # Create the beam_search object.
    self.beam_search = BeamSearch(
        beam_size=beam_size,
        weights=scoring_weights,
        scorers=scoring_modules,
        sos=self.model.sos,
        eos=self.model.eos,
        vocab_size=len(vocabulary),
        token_list=vocabulary,
        pre_beam_score_key=pre_beam_key,
    )
    self.beam_search.to(device=self.device, dtype=float_dtype).eval()

    # Scorers that are torch modules are moved and put in eval mode as well.
    for module in scoring_modules.values():
        if not isinstance(module, torch.nn.Module):
            continue
        module.to(device=self.device, dtype=float_dtype).eval()
def __init__(self, model_path: str, lm_path: str):
    """Load a trained ASR model plus a language model and wire up beam search.

    Args:
        model_path: Path handed to ``load_trained_model``.
        lm_path: Path handed to the private LM factory of this class.
    """
    super(TraceModel, self).__init__()

    self.model, self.train_args = load_trained_model(model_path=model_path)
    logging.info(self.model)
    assert isinstance(self.model, ASRInterface)
    self.model.eval()

    self.recog_args = self.__get_recog_args()
    self.rnnlm = self.__make_lm_module(lm_path=lm_path)

    char_list = self.train_args.char_list
    ctc_weight = self.recog_args.ctc_weight

    # Start from the scorers exposed by the model, then add the LM and a
    # length bonus on top.
    scoring_modules = self.model.scorers()
    scoring_modules["lm"] = self.rnnlm
    scoring_modules["length_bonus"] = LengthBonus(len(char_list))

    scoring_weights = {
        "decoder": 1.0 - ctc_weight,
        "ctc": ctc_weight,
        "lm": self.recog_args.lm_weight,
        "length_bonus": self.recog_args.penalty,
    }

    # No pre-beam scoring key is used when decoding relies on CTC alone.
    if ctc_weight == 1.0:
        pre_beam_key = None
    else:
        pre_beam_key = "decoder"

    self.beam_search = BeamSearch(
        beam_size=self.recog_args.beam_size,
        vocab_size=len(char_list),
        weights=scoring_weights,
        scorers=scoring_modules,
        sos=self.model.sos,
        eos=self.model.eos,
        token_list=char_list,
        pre_beam_score_key=pre_beam_key,
    )
def test_TransformerLM_beam_search(pos_enc, dtype):
    """Smoke-test BeamSearch with a TransformerLM as its only scorer.

    Nothing is asserted on the output; the test passes when one full search
    over a random (10, 20) input completes without raising.
    """
    tokens = ["<blank>", "a", "b", "c", "unk", "<eos>"]
    n_vocab = len(tokens)
    # The last vocabulary entry is used as both the sos and the eos id.
    boundary_id = n_vocab - 1

    lm = TransformerLM(n_vocab, pos_enc=pos_enc, unit=10)
    search = BeamSearch(
        beam_size=3,
        vocab_size=n_vocab,
        weights={"test": 1.0},
        scorers={"test": lm},
        token_list=tokens,
        sos=boundary_id,
        eos=boundary_id,
        pre_beam_score_key=None,
    )
    search.to(dtype=dtype)

    feats = torch.randn(10, 20).type(dtype)
    with torch.no_grad():
        search(x=feats, maxlenratio=0.0, minlenratio=0.0)
def inference(
    output_dir: str,
    maxlenratio: float,
    minlenratio: float,
    batch_size: int,
    dtype: str,
    beam_size: int,
    ngpu: int,
    seed: int,
    ctc_weight: float,
    lm_weight: float,
    penalty: float,
    nbest: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    asr_train_config: str,
    asr_model_file: str,
    lm_train_config: Optional[str],
    lm_file: Optional[str],
    word_lm_train_config: Optional[str],
    word_lm_file: Optional[str],
    blank_symbol: str,
    token_type: Optional[str],
    bpemodel: Optional[str],
    allow_variable_data_keys: bool,
):
    """Decode a dataset with an ASR model and write the n-best results.

    For every utterance yielded by the data iterator the encoder is run, the
    beam search is applied to the encoder output, and up to ``nbest``
    hypotheses are written under ``output_dir`` in ``{n}best_recog``
    sub-directories (``token``, ``token_int``, ``score`` and, when a
    tokenizer is available, ``text``).

    Args:
        output_dir: Directory handed to ``DatadirWriter``.
        maxlenratio, minlenratio: Forwarded to the beam search call.
        batch_size: Must be 1; larger values are rejected.
        dtype: Name of a torch dtype (looked up with ``getattr(torch, dtype)``).
        beam_size: Beam width.
        ngpu: 0 selects "cpu", 1 selects "cuda"; more than 1 is rejected.
        seed: Random seed.
        ctc_weight: CTC weight; the decoder gets ``1 - ctc_weight``.
        lm_weight: Weight of the optional language model scorer.
        penalty: Weight of the length bonus scorer.
        nbest: Maximum number of hypotheses written per utterance.
        num_workers, data_path_and_name_and_type, key_file,
        allow_variable_data_keys: Forwarded to the streaming iterator.
        log_level: Level passed to ``logging.basicConfig``.
        asr_train_config, asr_model_file: ASR model to load.
        lm_train_config, lm_file: Optional language model to load.
        word_lm_train_config: Must be None (word LM is not implemented).
        word_lm_file, blank_symbol: Accepted but not used by this function.
        token_type, bpemodel: Override the values stored in the ASR training
            arguments when not None.

    Raises:
        NotImplementedError: For ``batch_size > 1``, a word LM config, or
            ``ngpu > 1``.
    """
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if word_lm_train_config is not None:
        raise NotImplementedError("Word LM is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build ASR model
    scorers = {}
    asr_model, asr_train_args = ASRTask.build_model_from_file(
        asr_train_config, asr_model_file, device
    )
    asr_model.eval()

    decoder = asr_model.decoder
    ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
    token_list = asr_model.token_list
    scorers.update(
        decoder=decoder,
        ctc=ctc,
        length_bonus=LengthBonus(len(token_list)),
    )

    # 3. Build Language model
    if lm_train_config is not None:
        lm, lm_train_args = LMTask.build_model_from_file(
            lm_train_config, lm_file, device
        )
        scorers["lm"] = lm.lm

    # 4. Build BeamSearch object
    weights = dict(
        decoder=1.0 - ctc_weight,
        ctc=ctc_weight,
        lm=lm_weight,
        length_bonus=penalty,
    )
    beam_search = BeamSearch(
        beam_size=beam_size,
        weights=weights,
        scorers=scorers,
        sos=asr_model.sos,
        eos=asr_model.eos,
        vocab_size=len(token_list),
        token_list=token_list,
    )
    torch_dtype = getattr(torch, dtype)
    beam_search.to(device=device, dtype=torch_dtype).eval()
    for scorer in scorers.values():
        if isinstance(scorer, torch.nn.Module):
            scorer.to(device=device, dtype=torch_dtype).eval()
    logging.info(f"Beam_search: {beam_search}")
    logging.info(f"Decoding device={device}, dtype={dtype}")

    # 5. Build data-iterator
    loader = ASRTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=ASRTask.build_preprocess_fn(asr_train_args, False),
        collate_fn=ASRTask.build_collate_fn(asr_train_args),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 6. [Optional] Build Text converter: e.g. bpe-sym -> Text
    if token_type is None:
        token_type = asr_train_args.token_type
    if bpemodel is None:
        bpemodel = asr_train_args.bpemodel

    if token_type is None:
        tokenizer = None
    elif token_type == "bpe":
        if bpemodel is not None:
            tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
        else:
            tokenizer = None
    else:
        tokenizer = build_tokenizer(token_type=token_type)
    converter = TokenIDConverter(token_list=token_list)
    logging.info(f"Text tokenizer: {tokenizer}")

    # 7. Start for-loop
    # FIXME(kamo): The output format should be discussed about
    with DatadirWriter(output_dir) as writer:
        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"

            with torch.no_grad():
                # a. To device
                batch = to_device(batch, device)

                # b. Forward Encoder
                enc, _ = asr_model.encode(**batch)
                assert len(enc) == batch_size, len(enc)

                # c. Pass the encoder result to the beam search
                nbest_hyps = beam_search(
                    x=enc[0], maxlenratio=maxlenratio, minlenratio=minlenratio
                )

            # Only supporting batch_size==1
            key = keys[0]
            # zip() stops at whichever is shorter, so a search that returns
            # fewer than ``nbest`` hypotheses no longer raises IndexError.
            for n, hyp in zip(range(1, nbest + 1), nbest_hyps):
                assert isinstance(hyp, Hypothesis), type(hyp)

                # remove sos/eos and get results
                token_int = hyp.yseq[1:-1].tolist()

                # remove blank symbol id, which is assumed to be 0
                token_int = list(filter(lambda x: x != 0, token_int))

                # Change integer-ids to tokens
                token = converter.ids2tokens(token_int)

                # Create a directory: outdir/{n}best_recog
                ibest_writer = writer[f"{n}best_recog"]

                # Write the result to each files
                ibest_writer["token"][key] = " ".join(token)
                ibest_writer["token_int"][key] = " ".join(map(str, token_int))
                ibest_writer["score"][key] = str(hyp.score)

                if tokenizer is not None:
                    text = tokenizer.tokens2text(token)
                    ibest_writer["text"][key] = text
def recog_v2(args):
    """Decode with custom models that implements ScorerInterface.

    Loads the trained ASR model (optionally dynamically quantized), an
    optional LM and an optional n-gram scorer, builds a beam search over
    them, decodes every utterance listed in ``args.recog_json`` one at a
    time, and writes the n-best results as JSON to ``args.result_label``.

    Notes:
        The previous backend espnet.asr.pytorch_backend.asr.recog
        only supports E2E and RNNLM

    Args:
        args (namespace): The program arguments.
        See py:func:`espnet.bin.asr_recog.get_parser` for details

    Raises:
        NotImplementedError: For multi-utterance batches, streaming mode,
            a word LM, or more than one GPU.
        ValueError: When the requested quantization is not supported by the
            installed torch version.
    """
    logging.warning("experimental API for custom LMs is selected by --api v2")
    if args.batchsize > 1:
        raise NotImplementedError("multi-utt batch decoding is not implemented")
    if args.streaming_mode is not None:
        raise NotImplementedError("streaming mode is not implemented")
    if args.word_rnnlm:
        raise NotImplementedError("word LM is not implemented")

    set_deterministic_pytorch(args)
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, ASRInterface)

    # Module classes to quantize; defaults to linear layers only.
    if args.quantize_config is not None:
        q_config = {getattr(torch.nn, q) for q in args.quantize_config}
    else:
        q_config = {torch.nn.Linear}

    if args.quantize_asr_model:
        logging.info("Use quantized asr model for decoding")

        # See https://github.com/espnet/espnet/pull/3616 for more information.
        if (
            torch.__version__ < LooseVersion("1.4.0")
            and "lstm" in train_args.etype
            and torch.nn.LSTM in q_config
        ):
            raise ValueError(
                "Quantized LSTM in ESPnet is only supported with torch 1.4+."
            )

        if args.quantize_dtype == "float16" and torch.__version__ < LooseVersion(
            "1.5.0"
        ):
            raise ValueError(
                "float16 dtype for dynamic quantization is not supported with torch "
                "version < 1.5.0. Switching to qint8 dtype instead."
            )

        dtype = getattr(torch, args.quantize_dtype)
        model = torch.quantization.quantize_dynamic(model, q_config, dtype=dtype)

    model.eval()
    load_inputs_and_targets = LoadInputsAndTargets(
        mode="asr",
        load_output=False,
        sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf,
        preprocess_args={"train": False},
    )

    if args.rnnlm:
        lm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        # NOTE: for a compatibility with less than 0.5.0 version models
        lm_model_module = getattr(lm_args, "model_module", "default")
        lm_class = dynamic_import_lm(lm_model_module, lm_args.backend)
        lm = lm_class(len(train_args.char_list), lm_args)
        torch_load(args.rnnlm, lm)
        if args.quantize_lm_model:
            logging.info("Use quantized lm model")
            dtype = getattr(torch, args.quantize_dtype)
            lm = torch.quantization.quantize_dynamic(lm, q_config, dtype=dtype)
        lm.eval()
    else:
        lm = None

    if args.ngram_model:
        from espnet.nets.scorers.ngram import NgramFullScorer
        from espnet.nets.scorers.ngram import NgramPartScorer

        if args.ngram_scorer == "full":
            ngram = NgramFullScorer(args.ngram_model, train_args.char_list)
        else:
            ngram = NgramPartScorer(args.ngram_model, train_args.char_list)
    else:
        ngram = None

    scorers = model.scorers()
    scorers["lm"] = lm
    scorers["ngram"] = ngram
    scorers["length_bonus"] = LengthBonus(len(train_args.char_list))
    weights = dict(
        decoder=1.0 - args.ctc_weight,
        ctc=args.ctc_weight,
        lm=args.lm_weight,
        ngram=args.ngram_weight,
        length_bonus=args.penalty,
    )
    beam_search = BeamSearch(
        beam_size=args.beam_size,
        vocab_size=len(train_args.char_list),
        weights=weights,
        scorers=scorers,
        sos=model.sos,
        eos=model.eos,
        token_list=train_args.char_list,
        pre_beam_score_key=None if args.ctc_weight == 1.0 else "full",
    )
    # TODO(karita): make all scorers batchfied
    if args.batchsize == 1:
        non_batch = [
            k
            for k, v in beam_search.full_scorers.items()
            if not isinstance(v, BatchScorerInterface)
        ]
        if len(non_batch) == 0:
            # Swap the implementation in place; the instance state is reused.
            beam_search.__class__ = BatchBeamSearch
            logging.info("BatchBeamSearch implementation is selected.")
        else:
            logging.warning(
                f"As non-batch scorers {non_batch} are found, "
                f"fall back to non-batch implementation."
            )

    if args.ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")
    if args.ngpu == 1:
        device = "cuda"
    else:
        device = "cpu"
    dtype = getattr(torch, args.dtype)
    logging.info(f"Decoding device={device}, dtype={dtype}")
    model.to(device=device, dtype=dtype).eval()
    beam_search.to(device=device, dtype=dtype).eval()

    # read json data
    with open(args.recog_json, "rb") as f:
        js = json.load(f)["utts"]
    new_js = {}
    n_utts = len(js)
    with torch.no_grad():
        for idx, name in enumerate(js.keys(), 1):
            # The name is passed as an argument rather than concatenated
            # into the format string, so a "%" inside an utterance id cannot
            # corrupt the log formatting.
            logging.info("(%d/%d) decoding %s", idx, n_utts, name)
            batch = [(name, js[name])]
            feat = load_inputs_and_targets(batch)[0][0]
            enc = model.encode(torch.as_tensor(feat).to(device=device, dtype=dtype))
            nbest_hyps = beam_search(
                x=enc, maxlenratio=args.maxlenratio, minlenratio=args.minlenratio
            )
            nbest_hyps = [
                h.asdict() for h in nbest_hyps[: min(len(nbest_hyps), args.nbest)]
            ]
            new_js[name] = add_results_to_json(
                js[name], nbest_hyps, train_args.char_list
            )

    with open(args.result_label, "wb") as f:
        f.write(
            json.dumps(
                {"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )
def __init__(
    self,
    asr_train_config: Union[Path, str],
    asr_model_file: Union[Path, str] = None,
    lm_train_config: Union[Path, str] = None,
    lm_file: Union[Path, str] = None,
    token_type: str = None,
    bpemodel: str = None,
    device: str = "cpu",
    maxlenratio: float = 0.0,
    minlenratio: float = 0.0,
    batch_size: int = 1,
    dtype: str = "float32",
    beam_size: int = 20,
    ctc_weight: float = 0.5,
    lm_weight: float = 1.0,
    penalty: float = 0.0,
    nbest: int = 1,
    streaming: bool = False,
):
    """Load the ASR model (and optional LM) and prepare beam-search decoding.

    Args:
        asr_train_config, asr_model_file: ASR model to load.
        lm_train_config, lm_file: Optional language model; skipped when
            ``lm_train_config`` is None.
        token_type, bpemodel: Override the values stored in the ASR training
            arguments when not None.
        device: Device the models and the beam search are moved to.
        maxlenratio, minlenratio, nbest: Stored on the instance for use at
            decoding time.
        batch_size: When 1, a batched beam-search implementation is selected
            if every full scorer supports it.
        dtype: Name of a torch dtype.
        beam_size: Beam width.
        ctc_weight: CTC weight; the decoder gets ``1 - ctc_weight``.
        lm_weight: Weight of the language model scorer.
        penalty: Weight of the length bonus scorer.
        streaming: Select the online-simulation batched beam search.
    """
    assert check_argument_types()

    # 1. ASR model and the scorers derived from it.
    asr_model, asr_train_args = ASRTask.build_model_from_file(
        asr_train_config, asr_model_file, device
    )
    torch_dtype = getattr(torch, dtype)
    asr_model.to(dtype=torch_dtype).eval()

    vocabulary = asr_model.token_list
    scoring_modules = {
        "decoder": asr_model.decoder,
        "ctc": CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos),
        "length_bonus": LengthBonus(len(vocabulary)),
    }

    # 2. Optional language model.
    if lm_train_config is not None:
        lm, lm_train_args = LMTask.build_model_from_file(
            lm_train_config, lm_file, device
        )
        scoring_modules["lm"] = lm.lm

    # 3. Beam search over all scorers.
    scoring_weights = {
        "decoder": 1.0 - ctc_weight,
        "ctc": ctc_weight,
        "lm": lm_weight,
        "length_bonus": penalty,
    }
    # No pre-beam scoring key is used when decoding relies on CTC alone.
    if ctc_weight == 1.0:
        pre_beam_key = None
    else:
        pre_beam_key = "full"
    searcher = BeamSearch(
        beam_size=beam_size,
        weights=scoring_weights,
        scorers=scoring_modules,
        sos=asr_model.sos,
        eos=asr_model.eos,
        vocab_size=len(vocabulary),
        token_list=vocabulary,
        pre_beam_score_key=pre_beam_key,
    )

    # TODO(karita): make all scorers batchfied
    if batch_size == 1:
        non_batch = [
            name
            for name, module in searcher.full_scorers.items()
            if not isinstance(module, BatchScorerInterface)
        ]
        if non_batch:
            logging.warning(f"As non-batch scorers {non_batch} are found, "
                            f"fall back to non-batch implementation.")
        elif streaming:
            # The implementation is swapped in place on the same instance.
            searcher.__class__ = BatchBeamSearchOnlineSim
            searcher.set_streaming_config(asr_train_config)
            logging.info(
                "BatchBeamSearchOnlineSim implementation is selected.")
        else:
            searcher.__class__ = BatchBeamSearch
            logging.info("BatchBeamSearch implementation is selected.")

    searcher.to(device=device, dtype=torch_dtype).eval()
    for module in scoring_modules.values():
        if isinstance(module, torch.nn.Module):
            module.to(device=device, dtype=torch_dtype).eval()
    logging.info(f"Beam_search: {searcher}")
    logging.info(f"Decoding device={device}, dtype={dtype}")

    # 4. [Optional] Text converter, e.g. bpe-sym -> Text. Explicit arguments
    # win over the values recorded at training time.
    if token_type is None:
        token_type = asr_train_args.token_type
    if bpemodel is None:
        bpemodel = asr_train_args.bpemodel

    tokenizer = None
    if token_type == "bpe":
        # A BPE tokenizer can only be built when a model file is known.
        if bpemodel is not None:
            tokenizer = build_tokenizer(token_type=token_type,
                                        bpemodel=bpemodel)
    elif token_type is not None:
        tokenizer = build_tokenizer(token_type=token_type)
    converter = TokenIDConverter(token_list=vocabulary)
    logging.info(f"Text tokenizer: {tokenizer}")

    self.asr_model = asr_model
    self.asr_train_args = asr_train_args
    self.converter = converter
    self.tokenizer = tokenizer
    self.beam_search = searcher
    self.maxlenratio = maxlenratio
    self.minlenratio = minlenratio
    self.device = device
    self.dtype = dtype
    self.nbest = nbest
# NOTE(review): this bare expression looks like the tail of the preceding
# notebook cell (displaying ``model``) — confirm against the full file.
model

"""## 4.3 Recognize the speech by the model

You can perform joint decoding with all the models (S2S, CTC, LM, etc) in ESPnet
"""

import re
from espnet.nets.beam_search import BeamSearch

# Pick one utterance (index 10) from the test set and load its features.
key, info = list(test_json.items())[10]
fbank = kaldiio.load_mat(info["input"][0]["feat"])

# setup beam search
# Decoder and CTC scores are weighted equally; beam width is 2.
bs = BeamSearch(
    scorers=model.scorers(),
    weights={"decoder": 0.5, "ctc": 0.5},
    sos=model.sos,
    eos=model.eos,
    beam_size=2,
    vocab_size=len(vocab))
# GPU decoding: model.cuda(), bs.cuda()

# Encode the features and run the search without tracking gradients.
with torch.no_grad():
    encoded = model.encode(torch.as_tensor(fbank))
    result = bs(encoded)

# get N-best results
print("groundtruth:", info["output"][0]["text"])
print("N-best list:")
for n, hyp in enumerate(result, 1):
    # Map token ids back to symbols, then turn the space marker into a real
    # space and drop the end-of-sentence marker.
    text = "".join(vocab[y] for y in hyp.yseq).replace("<space>", " ").replace("<eos>", "")
    # Per-scorer scores, formatted with three decimals for display.
    scores = {k: f"{float(v):0.3f}" for k, v in hyp.scores.items()}
    print(f"{n}: {text}, score: {scores}")

"""## 4.4 Visualizations
def __init__(
    self,
    asr_train_config: Union[Path, str] = None,
    asr_model_file: Union[Path, str] = None,
    transducer_conf: dict = None,
    lm_train_config: Union[Path, str] = None,
    lm_file: Union[Path, str] = None,
    ngram_scorer: str = "full",
    ngram_file: Union[Path, str] = None,
    token_type: str = None,
    bpemodel: str = None,
    device: str = "cpu",
    maxlenratio: float = 0.0,
    minlenratio: float = 0.0,
    batch_size: int = 1,
    dtype: str = "float32",
    beam_size: int = 20,
    ctc_weight: float = 0.5,
    lm_weight: float = 1.0,
    ngram_weight: float = 0.9,
    penalty: float = 0.0,
    nbest: int = 1,
    streaming: bool = False,
    enh_s2t_task: bool = False,
    quantize_asr_model: bool = False,
    quantize_lm: bool = False,
    quantize_modules: List[str] = ["Linear"],
    quantize_dtype: str = "qint8",
):
    """Load the ASR model and optional LM / n-gram scorers and set up decoding.

    Depending on ``asr_model.use_transducer_decoder`` either a
    ``BeamSearchTransducer`` or a regular ``BeamSearch`` is built; the other
    attribute is set to None.

    Args:
        asr_train_config, asr_model_file: ASR model to load.
        transducer_conf: Extra keyword arguments for ``BeamSearchTransducer``.
            None is treated as "no extra arguments".
        lm_train_config, lm_file: Optional language model.
        ngram_scorer: "full" selects ``NgramFullScorer``; anything else
            selects ``NgramPartScorer``.
        ngram_file: Optional n-gram model; no n-gram scorer when None.
        token_type, bpemodel: Override the values stored in the ASR training
            arguments when not None.
        device: Device the models and the beam search are moved to.
        maxlenratio, minlenratio, nbest: Stored on the instance for use at
            decoding time.
        batch_size: When 1, a batched beam-search implementation is selected
            if every full scorer supports it.
        dtype: Name of a torch dtype.
        beam_size: Beam width.
        ctc_weight: CTC weight; the decoder gets ``1 - ctc_weight``.
        lm_weight, ngram_weight, penalty: Weights of the LM, n-gram and
            length-bonus scorers.
        streaming: Select the online-simulation batched beam search.
        enh_s2t_task: Build the model with ``EnhS2TTask`` instead of
            ``ASRTask``.
        quantize_asr_model, quantize_lm: Apply dynamic quantization to the
            ASR model / the LM.
        quantize_modules: Names of ``torch.nn`` classes to quantize. The
            default list is only read and rebound, never mutated.
        quantize_dtype: Name of the torch dtype used for quantization.

    Raises:
        ValueError: If float16 quantization is requested with torch < 1.5.0.
    """
    assert check_argument_types()

    task = ASRTask if not enh_s2t_task else EnhS2TTask

    if quantize_asr_model or quantize_lm:
        if quantize_dtype == "float16" and torch.__version__ < LooseVersion(
                "1.5.0"):
            raise ValueError(
                "float16 dtype for dynamic quantization is not supported with "
                "torch version < 1.5.0. Switch to qint8 dtype instead.")

    # Resolve class names / dtype name to the actual torch objects.
    quantize_modules = {getattr(torch.nn, q) for q in quantize_modules}
    quantize_dtype = getattr(torch, quantize_dtype)
    torch_dtype = getattr(torch, dtype)

    # 1. Build ASR model
    scorers = {}
    asr_model, asr_train_args = task.build_model_from_file(
        asr_train_config, asr_model_file, device)
    if enh_s2t_task:
        asr_model.inherite_attributes(inherite_s2t_attrs=[
            "ctc",
            "decoder",
            "eos",
            "joint_network",
            "sos",
            "token_list",
            "use_transducer_decoder",
        ])
    asr_model.to(dtype=torch_dtype).eval()

    if quantize_asr_model:
        logging.info("Use quantized asr model for decoding.")

        asr_model = torch.quantization.quantize_dynamic(
            asr_model, qconfig_spec=quantize_modules, dtype=quantize_dtype)

    decoder = asr_model.decoder

    ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
    token_list = asr_model.token_list
    scorers.update(
        decoder=decoder,
        ctc=ctc,
        length_bonus=LengthBonus(len(token_list)),
    )

    # 2. Build Language model
    if lm_train_config is not None:
        lm, lm_train_args = LMTask.build_model_from_file(
            lm_train_config, lm_file, device)

        if quantize_lm:
            logging.info("Use quantized lm for decoding.")

            lm = torch.quantization.quantize_dynamic(
                lm, qconfig_spec=quantize_modules, dtype=quantize_dtype)

        scorers["lm"] = lm.lm

    # 3. Build ngram model
    if ngram_file is not None:
        if ngram_scorer == "full":
            from espnet.nets.scorers.ngram import NgramFullScorer

            ngram = NgramFullScorer(ngram_file, token_list)
        else:
            from espnet.nets.scorers.ngram import NgramPartScorer

            ngram = NgramPartScorer(ngram_file, token_list)
    else:
        ngram = None
    scorers["ngram"] = ngram

    # 4. Build BeamSearch object
    if asr_model.use_transducer_decoder:
        beam_search_transducer = BeamSearchTransducer(
            decoder=asr_model.decoder,
            joint_network=asr_model.joint_network,
            beam_size=beam_size,
            lm=scorers.get("lm"),
            lm_weight=lm_weight,
            # ``transducer_conf`` defaults to None, which cannot be
            # unpacked; treat it as an empty configuration.
            **(transducer_conf or {}),
        )
        beam_search = None
    else:
        # Everything below configures the regular beam search, so it lives
        # in this branch: on the transducer path ``beam_search`` is None and
        # must not be dereferenced.
        beam_search_transducer = None

        weights = dict(
            decoder=1.0 - ctc_weight,
            ctc=ctc_weight,
            lm=lm_weight,
            ngram=ngram_weight,
            length_bonus=penalty,
        )
        beam_search = BeamSearch(
            beam_size=beam_size,
            weights=weights,
            scorers=scorers,
            sos=asr_model.sos,
            eos=asr_model.eos,
            vocab_size=len(token_list),
            token_list=token_list,
            pre_beam_score_key=None if ctc_weight == 1.0 else "full",
        )

        # TODO(karita): make all scorers batchfied
        if batch_size == 1:
            non_batch = [
                k for k, v in beam_search.full_scorers.items()
                if not isinstance(v, BatchScorerInterface)
            ]
            if len(non_batch) == 0:
                if streaming:
                    beam_search.__class__ = BatchBeamSearchOnlineSim
                    beam_search.set_streaming_config(asr_train_config)
                    logging.info(
                        "BatchBeamSearchOnlineSim implementation is selected."
                    )
                else:
                    beam_search.__class__ = BatchBeamSearch
                    logging.info(
                        "BatchBeamSearch implementation is selected.")
            else:
                logging.warning(
                    f"As non-batch scorers {non_batch} are found, "
                    f"fall back to non-batch implementation.")

        beam_search.to(device=device, dtype=torch_dtype).eval()
        for scorer in scorers.values():
            if isinstance(scorer, torch.nn.Module):
                scorer.to(device=device, dtype=torch_dtype).eval()
        logging.info(f"Beam_search: {beam_search}")
        logging.info(f"Decoding device={device}, dtype={dtype}")

    # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
    if token_type is None:
        token_type = asr_train_args.token_type
    if bpemodel is None:
        bpemodel = asr_train_args.bpemodel

    if token_type is None:
        tokenizer = None
    elif token_type == "bpe":
        if bpemodel is not None:
            tokenizer = build_tokenizer(token_type=token_type,
                                        bpemodel=bpemodel)
        else:
            tokenizer = None
    else:
        tokenizer = build_tokenizer(token_type=token_type)
    converter = TokenIDConverter(token_list=token_list)
    logging.info(f"Text tokenizer: {tokenizer}")

    self.asr_model = asr_model
    self.asr_train_args = asr_train_args
    self.converter = converter
    self.tokenizer = tokenizer
    self.beam_search = beam_search
    self.beam_search_transducer = beam_search_transducer
    self.maxlenratio = maxlenratio
    self.minlenratio = minlenratio
    self.device = device
    self.dtype = dtype
    self.nbest = nbest
class ASR(object):
    """Speech recognizer built from a zipped, trained ASR model.

    The archive is extracted next to the zip file, the model is built with
    ``ASRTask.build_model_from_file`` on the CPU, and a beam search plus a
    tokenizer are prepared so that instances can be called directly on an
    audio file.
    """

    def __init__(
        self,
        zip_model_file: Union[Path, str],
    ) -> None:
        self.zip_model_file = abspath(zip_model_file)
        self.device = 'cpu'
        self.model = None
        self.beam_search = None
        self.tokenizer = None
        self.converter = None
        self.global_cmvn = None
        self.extract_zip_model_file(self.zip_model_file)

    def extract_zip_model_file(self, zip_model_file: str) -> Dict[str, Any]:
        """Extract the zip holding the model state and its configuration.

        After extraction the model, the beam search and the tokenizer are
        built.

        Args:
            zip_model_file (str): Zip file of the model produced by the
                training scripts.

        Raises:
            ValueError: If the file is not a zip file.
            FileNotFoundError: If the zip does not contain the required
                entries (``exp`` and ``meta.yaml``).

        Returns:
            Dict[str, Any]: Contents of the .yaml file used during training,
                needed to load the model correctly.
        """
        print("Unzipping model")
        if not zipfile.is_zipfile(zip_model_file):
            raise ValueError(f"File {zip_model_file} is not a zipfile")

        extract_dir = dirname(zip_model_file)
        # The context manager closes the archive handle once extracted.
        with zipfile.ZipFile(zip_model_file) as archive:
            archive.extractall(extract_dir)

        # Verify that the expected entries really exist after extraction.
        missing = [
            x for x in ('exp', 'meta.yaml')
            if not exists(join(extract_dir, x))
        ]
        if missing:
            raise FileNotFoundError(
                f"{zip_model_file} does not contain: {', '.join(missing)}")

        print("Load yaml file")
        # NOTE(review): yaml.FullLoader can build more than plain data; the
        # archive is assumed to be trusted. Consider yaml.safe_load otherwise.
        with open(join(extract_dir, 'meta.yaml')) as f:
            meta = yaml.load(f, Loader=yaml.FullLoader)

        model_stats_file = meta['files']['asr_model_file']
        asr_model_config_file = meta['yaml_files']['asr_train_config']
        # Files were extracted next to the zip, so resolve them from there
        # instead of the current working directory (join() leaves absolute
        # paths untouched).
        # NOTE(review): relative paths inside the training config itself
        # (e.g. the normalization stats file) are not rewritten here —
        # confirm they resolve from the working directory in use.
        model_stats_path = join(extract_dir, model_stats_file)
        asr_model_config_path = join(extract_dir, asr_model_config_file)

        self.model_config = {}
        with open(asr_model_config_path) as f:
            self.model_config = yaml.load(f, Loader=yaml.FullLoader)
        try:
            self.global_cmvn = self.model_config['normalize_conf'][
                'stats_file']
        except (KeyError, TypeError):
            # No normalization statistics configured.
            self.global_cmvn = None

        print(f'Loading model config from {asr_model_config_file}')
        print(f'Loading model state from {model_stats_file}')

        # Build model
        print('Building model')
        self.model, _ = ASRTask.build_model_from_file(asr_model_config_path,
                                                      model_stats_path,
                                                      self.device)
        self.model.to(dtype=torch.float32).eval()

        self.build_beam_search()
        self.build_tokenizer()
        return self.model_config

    def build_beam_search(self, ctc_weight: float = 0.4, beam_size: int = 1):
        """Build the beam-search decoding object.

        The object decodes the embedding sequence produced by the encoder by
        passing it through the network's decoders: the CTC module and the
        Transformer or RNN decoder. The two are combined as in training:

            Loss = (1 - λ) * DecoderLoss + λ * CTCLoss

        With ``ctc_weight=1`` only the CTC module contributes to decoding.

        Args:
            ctc_weight (float, optional): Weight given to the CTC module.
                Defaults to 0.4.
            beam_size (int, optional): Width of the search beam used while
                decoding. Defaults to 1.
        """
        scorers = {}
        ctc = CTCPrefixScorer(ctc=self.model.ctc, eos=self.model.eos)
        token_list = self.model.token_list
        scorers.update(
            decoder=self.model.decoder,
            ctc=ctc,
            length_bonus=LengthBonus(len(token_list)),
        )

        # Weight of each part of the decoding. "lm" refers to language
        # models, which are not used here; the original author notes the key
        # is still required by the object.
        weights = dict(
            decoder=1.0 - ctc_weight,
            ctc=ctc_weight,
            lm=1.0,
            length_bonus=0.0,
        )

        # Create the beam_search object.
        self.beam_search = BeamSearch(
            beam_size=beam_size,
            weights=weights,
            scorers=scorers,
            sos=self.model.sos,
            eos=self.model.eos,
            vocab_size=len(token_list),
            token_list=token_list,
            pre_beam_score_key=None if ctc_weight == 1.0 else "full",
        )

        self.beam_search.to(device=self.device, dtype=torch.float32).eval()
        for scorer in scorers.values():
            if isinstance(scorer, torch.nn.Module):
                scorer.to(device=self.device, dtype=torch.float32).eval()

    def build_tokenizer(self):
        """Create the tokenizer that maps integer tokens back to text.

        If the model configuration declares a BPE token type, the configured
        BPE model is used. Otherwise only the token type from the
        configuration is used.
        """
        token_type = self.model_config['token_type']
        if token_type == 'bpe':
            bpemodel = self.model_config['bpemodel']
            self.tokenizer = build_tokenizer(token_type=token_type,
                                             bpemodel=bpemodel)
        else:
            self.tokenizer = build_tokenizer(token_type=token_type)

        self.converter = TokenIDConverter(token_list=self.model.token_list)

    def get_layers(self) -> Dict[str, Dict[str, torch.Size]]:
        """Return the named layers and their shapes for every network module.

        The modules are: frontend, specaug, normalize, encoder, decoder and
        ctc. A module that is absent from the model (None) maps to an empty
        dictionary instead of raising.

        Returns:
            Dict[str, Dict[str, torch.Size]]: For each module, a dictionary
                of its state-dict entries and their shapes.
        """
        layers = {}
        for name in ('frontend', 'specaug', 'normalize', 'encoder',
                     'decoder', 'ctc'):
            module = getattr(self.model, name, None)
            if module is None:
                layers[name] = {}
                continue
            # state_dict() is evaluated once per module.
            layers[name] = {
                key: value.shape
                for key, value in module.state_dict().items()
            }
        return layers

    @staticmethod
    def _load_audio(audiofile):
        """Load ``audiofile`` (path or raw bytes) as samples at 16 kHz."""
        if isinstance(audiofile, (str, Path)):
            audio_samples, _ = librosa.load(os.fspath(audiofile), sr=16000)
        elif isinstance(audiofile, bytes):
            audio_samples, _ = librosa.core.load(io.BytesIO(audiofile),
                                                 sr=16000)
        else:
            raise ValueError("Failed to load audio file")
        return audio_samples

    @staticmethod
    def _as_speech_batch(audio_samples):
        """Turn samples into a float32 batch of one plus its length tensor."""
        if isinstance(audio_samples, np.ndarray):
            audio_samples = torch.tensor(audio_samples)
        audio_samples = audio_samples.unsqueeze(0).to(torch.float32)
        lengths = audio_samples.new_full([1],
                                         dtype=torch.long,
                                         fill_value=audio_samples.size(1))
        return audio_samples, lengths

    def frontend(self,
                 audiofile: Union[Path, str, bytes],
                 normalize: bool = True) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run the model frontend, turning audio samples into features.

        Args:
            audiofile (Union[Path, str, bytes]): Audio file path or the raw
                bytes of an audio file.
            normalize (bool): Also apply the model's normalization module,
                when the model has one.

        Raises:
            ValueError: If ``audiofile`` is neither a path nor bytes.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: Features and the length of
                the feature sequence.
        """
        audio_samples, lengths = self._as_speech_batch(
            self._load_audio(audiofile))

        features, features_length = self.model.frontend(audio_samples, lengths)

        if normalize and self.model.normalize is not None:
            features, features_length = self.model.normalize(
                features, features_length)

        return features, features_length

    def specaug(
        self, features: torch.Tensor, features_length: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run the specaug ('data augmentation') module.

        Useful for visualization only; per the original author it is used
        during training and not during inference.

        Args:
            features (torch.Tensor): Features.
            features_length (torch.Tensor): Length of the feature sequence.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: Features with time masks,
                frequency masks and distortion applied, and their lengths.
        """
        return self.model.specaug(features, features_length)

    def __del__(self) -> None:
        """Remove the temporary files extracted from the archive."""
        # __init__ may have failed before the attribute was set.
        zip_path = getattr(self, 'zip_model_file', None)
        if zip_path is None:
            return
        for f in ['exp', 'meta.yaml']:
            print(f"Removing {f}")
            ff = join(dirname(zip_path), f)
            if exists(ff):
                if isdir(ff):
                    shutil.rmtree(ff)
                elif isfile(ff):
                    os.remove(ff)
                else:
                    raise ValueError("Error ao remover arquivos temporários")

    @torch.no_grad()
    def recognize(self, audiofile: Union[Path, str, bytes]) -> "Result":
        """Recognize the speech in ``audiofile`` and collect diagnostics.

        Args:
            audiofile (Union[Path, str, bytes]): Audio file path or the raw
                bytes of an audio file.

        Raises:
            ValueError: If ``audiofile`` is neither a path nor bytes.

        Returns:
            Result: Recognized text together with the audio samples, encoder
                output, tokens, attention weights, CTC posteriors and the
                un-normalized frontend features.
        """
        result = Result()

        audio_samples = self._load_audio(audiofile)
        result.audio_samples = copy.deepcopy(audio_samples)

        # The model takes torch tensors as input.
        speech, lengths = self._as_speech_batch(audio_samples)
        batch = {"speech": speech, "speech_lengths": lengths}
        batch = to_device(batch, device=self.device)

        # Model encoder
        enc, _ = self.model.encode(**batch)

        # Model decoder
        nbest_hyps = self.beam_search(x=enc[0])

        # Only the best hypothesis is kept.
        best_hyps = nbest_hyps[0]

        # Convert training token ids to text: strip sos/eos, then drop id 0.
        token_int = best_hyps.yseq[1:-1].tolist()
        token_int = [t for t in token_int if t != 0]
        token = self.converter.ids2tokens(token_int)
        text = self.tokenizer.tokens2text(token)

        # Fill in the result object.
        result.text = text
        result.encoded_vector = enc[0]  # [0] removes the batch dimension

        # Compute all attention matrices.
        text_tensor = torch.tensor(token_int, dtype=torch.long).unsqueeze(0)
        batch["text"] = text_tensor
        batch["text_lengths"] = text_tensor.new_full(
            [1], dtype=torch.long, fill_value=text_tensor.size(1))

        result.attention_weights = calculate_all_attentions(self.model, batch)
        result.tokens_txt = token

        # CTC posteriors. ``enc`` already carries the batch axis, so it is
        # passed as-is; the output keeps the same (batch, ...) layout as
        # before. NOTE(review): adding a second leading axis would shift the
        # axis the CTC module normalizes over if it uses a fixed axis index
        # — confirm against the CTC implementation.
        logp = self.model.ctc.log_softmax(enc)
        result.ctc_posteriors = logp.exp_().numpy()
        result.tokens_int = best_hyps.yseq

        result.mel_features, _ = self.frontend(audiofile, normalize=False)
        return result

    def __call__(self, input: Union[Path, str, bytes]) -> "Result":
        """Shorthand for :meth:`recognize`."""
        return self.recognize(input)
def __init__(
    self,
    mt_train_config: Union[Path, str] = None,
    mt_model_file: Union[Path, str] = None,
    lm_train_config: Union[Path, str] = None,
    lm_file: Union[Path, str] = None,
    ngram_scorer: str = "full",
    ngram_file: Union[Path, str] = None,
    token_type: str = None,
    bpemodel: str = None,
    device: str = "cpu",
    maxlenratio: float = 0.0,
    minlenratio: float = 0.0,
    batch_size: int = 1,
    dtype: str = "float32",
    beam_size: int = 20,
    lm_weight: float = 1.0,
    ngram_weight: float = 0.9,
    penalty: float = 0.0,
    nbest: int = 1,
):
    """Assemble the MT model, its scorers, the beam search and the text tools.

    Args:
        mt_train_config: Training config passed to
            ``MTTask.build_model_from_file``.
        mt_model_file: Model file passed to ``MTTask.build_model_from_file``.
        lm_train_config: Language-model training config. When ``None`` no
            ``"lm"`` scorer is registered.
        lm_file: Language-model file, used only with ``lm_train_config``.
        ngram_scorer: ``"full"`` selects ``NgramFullScorer``; any other value
            selects ``NgramPartScorer``.
        ngram_file: N-gram model file. When ``None`` the ``"ngram"`` scorer
            is registered as ``None``.
        token_type: Tokenizer type; defaults to the value stored in the MT
            training arguments.
        bpemodel: BPE model; defaults to the value stored in the MT training
            arguments.
        device: Device the models, scorers and beam search are moved to.
        maxlenratio: Stored on the instance as ``self.maxlenratio``.
        minlenratio: Stored on the instance as ``self.minlenratio``.
        batch_size: When equal to 1, the batched beam search is selected if
            every full scorer implements ``BatchScorerInterface``.
        dtype: Name of the torch dtype (looked up with ``getattr(torch, ...)``).
        beam_size: Beam width given to ``BeamSearch``.
        lm_weight: Weight of the ``"lm"`` scorer.
        ngram_weight: Weight of the ``"ngram"`` scorer.
        penalty: Weight of the ``"length_bonus"`` scorer.
        nbest: Stored on the instance as ``self.nbest``.
    """
    assert check_argument_types()

    # 1. MT model: load it, cast it and switch it to inference mode.
    mt_model, mt_train_args = MTTask.build_model_from_file(
        mt_train_config, mt_model_file, device)
    torch_dtype = getattr(torch, dtype)
    mt_model.to(dtype=torch_dtype).eval()

    token_list = mt_model.token_list
    scorers = {
        "decoder": mt_model.decoder,
        "length_bonus": LengthBonus(len(token_list)),
    }

    # 2. Optional neural language model.
    if lm_train_config is not None:
        lm, _ = LMTask.build_model_from_file(lm_train_config, lm_file, device)
        scorers["lm"] = lm.lm

    # 3. Optional n-gram model (the key is always registered, possibly as None).
    if ngram_file is None:
        ngram = None
    elif ngram_scorer == "full":
        from espnet.nets.scorers.ngram import NgramFullScorer

        ngram = NgramFullScorer(ngram_file, token_list)
    else:
        from espnet.nets.scorers.ngram import NgramPartScorer

        ngram = NgramPartScorer(ngram_file, token_list)
    scorers["ngram"] = ngram

    # 4. Beam search over the weighted scorers.
    weights = {
        "decoder": 1.0,
        "lm": lm_weight,
        "ngram": ngram_weight,
        "length_bonus": penalty,
    }
    beam_search = BeamSearch(
        beam_size=beam_size,
        weights=weights,
        scorers=scorers,
        sos=mt_model.sos,
        eos=mt_model.eos,
        vocab_size=len(token_list),
        token_list=token_list,
        pre_beam_score_key="full",
    )

    # TODO(karita): make all scorers batchfied
    if batch_size == 1:
        non_batch = [
            key for key, full_scorer in beam_search.full_scorers.items()
            if not isinstance(full_scorer, BatchScorerInterface)
        ]
        if non_batch:
            logging.warning(f"As non-batch scorers {non_batch} are found, "
                            f"fall back to non-batch implementation.")
        else:
            # Swap the class in place so the already-built object is reused.
            beam_search.__class__ = BatchBeamSearch
            logging.info("BatchBeamSearch implementation is selected.")

    beam_search.to(device=device, dtype=torch_dtype).eval()
    for scorer in scorers.values():
        if not isinstance(scorer, torch.nn.Module):
            continue
        scorer.to(device=device, dtype=torch_dtype).eval()
    logging.info(f"Beam_search: {beam_search}")
    logging.info(f"Decoding device={device}, dtype={dtype}")

    # 5. [Optional] Text converter: e.g. bpe-sym -> Text
    if token_type is None:
        token_type = mt_train_args.token_type
    if bpemodel is None:
        bpemodel = mt_train_args.bpemodel

    if token_type is None or (token_type == "bpe" and bpemodel is None):
        tokenizer = None
    elif token_type == "bpe":
        tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
    else:
        tokenizer = build_tokenizer(token_type=token_type)
    converter = TokenIDConverter(token_list=token_list)
    logging.info(f"Text tokenizer: {tokenizer}")

    self.mt_model = mt_model
    self.mt_train_args = mt_train_args
    self.converter = converter
    self.tokenizer = tokenizer
    self.beam_search = beam_search
    self.maxlenratio = maxlenratio
    self.minlenratio = minlenratio
    self.device = device
    self.dtype = dtype
    self.nbest = nbest
def recog_v2(args):
    """Decode with custom models that implements ScorerInterface.

    Every utterance listed under ``"utts"`` in ``args.recog_json`` is encoded
    by the trained model and decoded with ``BeamSearch``; the n-best results
    are written as UTF-8 JSON to ``args.result_label``.

    Notes:
        The previous backend espnet.asr.pytorch_backend.asr.recog
        only supports E2E and RNNLM

    Args:
        args (namespace): The program arguments.
            See py:func:`espnet.bin.asr_recog.get_parser` for details

    Raises:
        NotImplementedError: If batch decoding, streaming mode, a word-level
            RNNLM or more than one GPU is requested.
    """
    logging.warning("experimental API for custom LMs is selected by --api v2")
    if args.batchsize > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if args.streaming_mode is not None:
        raise NotImplementedError("streaming mode is not implemented")
    if args.word_rnnlm:
        raise NotImplementedError("word LM is not implemented")

    set_deterministic_pytorch(args)
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, ASRInterface)
    model.eval()

    load_inputs_and_targets = LoadInputsAndTargets(
        mode='asr', load_output=False, sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None else args.preprocess_conf,
        preprocess_args={'train': False})

    if args.rnnlm:
        lm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        # NOTE: for a compatibility with less than 0.5.0 version models
        lm_model_module = getattr(lm_args, "model_module", "default")
        lm_class = dynamic_import_lm(lm_model_module, lm_args.backend)
        lm = lm_class(len(train_args.char_list), lm_args)
        torch_load(args.rnnlm, lm)
        lm.eval()
    else:
        lm = None

    scorers = model.scorers()
    # The "lm" entry is registered even when no RNNLM was loaded (None).
    scorers["lm"] = lm
    scorers["length_bonus"] = LengthBonus(len(train_args.char_list))
    weights = dict(
        decoder=1.0 - args.ctc_weight,
        ctc=args.ctc_weight,
        lm=args.lm_weight,
        length_bonus=args.penalty)
    beam_search = BeamSearch(
        beam_size=args.beam_size,
        vocab_size=len(train_args.char_list),
        weights=weights,
        scorers=scorers,
        sos=model.sos,
        eos=model.eos,
        token_list=train_args.char_list,
    )

    if args.ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")
    device = "cuda" if args.ngpu == 1 else "cpu"
    dtype = getattr(torch, args.dtype)
    logging.info(f"Decoding device={device}, dtype={dtype}")
    model.to(device=device, dtype=dtype).eval()
    beam_search.to(device=device, dtype=dtype).eval()
    # Dumped once at debug level instead of being printed for every utterance.
    logging.debug("model: %s", model)

    # read json data
    with open(args.recog_json, 'rb') as f:
        js = json.load(f)['utts']

    new_js = {}
    num_utts = len(js)
    with torch.no_grad():
        for idx, name in enumerate(js.keys(), 1):
            # The utterance name is passed as an argument, never spliced into
            # the format string, so a "%" in an id cannot break the log call.
            logging.info('(%d/%d) decoding %s', idx, num_utts, name)
            batch = [(name, js[name])]
            feat = load_inputs_and_targets(batch)[0][0]
            enc = model.encode(
                torch.as_tensor(feat).to(device=device, dtype=dtype))
            logging.debug("encoder output shape: %s", enc.shape)
            nbest_hyps = beam_search(
                x=enc,
                maxlenratio=args.maxlenratio,
                minlenratio=args.minlenratio)
            # Slicing already clamps to the number of available hypotheses.
            nbest_hyps = [h.asdict() for h in nbest_hyps[:args.nbest]]
            new_js[name] = add_results_to_json(
                js[name], nbest_hyps, train_args.char_list)

    with open(args.result_label, 'wb') as f:
        f.write(
            json.dumps(
                {'utts': new_js}, indent=4, ensure_ascii=False, sort_keys=True
            ).encode('utf_8'))
def __init__(
    self,
    asr_train_config: Union[Path, str],
    asr_model_file: Union[Path, str] = None,
    lm_train_config: Union[Path, str] = None,
    lm_file: Union[Path, str] = None,
    token_type: str = None,
    bpemodel: str = None,
    device: str = "cpu",
    maxlenratio: float = 0.0,
    minlenratio: float = 0.0,
    dtype: str = "float32",
    beam_size: int = 20,
    ctc_weight: float = 0.5,
    lm_weight: float = 1.0,
    penalty: float = 0.0,
    nbest: int = 1,
):
    """Build the ASR model, its scorers, the beam search and the text tools.

    Args:
        asr_train_config: Training config passed to
            ``ASRTask.build_model_from_file``.
        asr_model_file: Model file passed to ``ASRTask.build_model_from_file``.
        lm_train_config: Language-model training config. When ``None`` no
            ``"lm"`` scorer is registered and ``self.lm_train_args`` is
            ``None``.
        lm_file: Language-model file, used only with ``lm_train_config``.
        token_type: Tokenizer type; defaults to the value stored in the ASR
            training arguments.
        bpemodel: BPE model; defaults to the value stored in the ASR training
            arguments.
        device: Device the model, scorers and beam search are placed on.
        maxlenratio: Stored on the instance as ``self.maxlenratio``.
        minlenratio: Stored on the instance as ``self.minlenratio``.
        dtype: Name of the torch dtype (looked up with ``getattr(torch, ...)``).
        beam_size: Beam width given to ``BeamSearch``.
        ctc_weight: Weight of the CTC scorer; the decoder gets
            ``1.0 - ctc_weight``.
        lm_weight: Weight of the ``"lm"`` scorer.
        penalty: Weight of the ``"length_bonus"`` scorer.
        nbest: Stored on the instance as ``self.nbest``.
    """
    assert check_argument_types()

    # 1. Build ASR model
    scorers = {}
    asr_model, asr_train_args = ASRTask.build_model_from_file(
        asr_train_config, asr_model_file, device
    )
    torch_dtype = getattr(torch, dtype)
    # Cast the whole model, not only the scorers below, so that the encoder
    # and CTC layers use the same dtype as the decoder.
    asr_model.to(dtype=torch_dtype).eval()

    decoder = asr_model.decoder
    ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
    token_list = asr_model.token_list
    scorers.update(
        decoder=decoder, ctc=ctc, length_bonus=LengthBonus(len(token_list)),
    )

    # 2. Build Language model
    # Defined up front: it is stored on the instance even when no LM is given.
    lm_train_args = None
    if lm_train_config is not None:
        lm, lm_train_args = LMTask.build_model_from_file(
            lm_train_config, lm_file, device
        )
        scorers["lm"] = lm.lm

    # 3. Build BeamSearch object
    weights = dict(
        decoder=1.0 - ctc_weight,
        ctc=ctc_weight,
        lm=lm_weight,
        length_bonus=penalty,
    )
    beam_search = BeamSearch(
        beam_size=beam_size,
        weights=weights,
        scorers=scorers,
        sos=asr_model.sos,
        eos=asr_model.eos,
        vocab_size=len(token_list),
        token_list=token_list,
    )
    beam_search.to(device=device, dtype=torch_dtype).eval()
    for scorer in scorers.values():
        if isinstance(scorer, torch.nn.Module):
            scorer.to(device=device, dtype=torch_dtype).eval()
    logging.info(f"Beam_search: {beam_search}")
    logging.info(f"Decoding device={device}, dtype={dtype}")

    # 4. [Optional] Build Text converter: e.g. bpe-sym -> Text
    if token_type is None:
        token_type = asr_train_args.token_type
    if bpemodel is None:
        bpemodel = asr_train_args.bpemodel

    if token_type is None:
        tokenizer = None
    elif token_type == "bpe":
        # A BPE tokenizer cannot be built without its model file.
        if bpemodel is not None:
            tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
        else:
            tokenizer = None
    else:
        tokenizer = build_tokenizer(token_type=token_type)
    converter = TokenIDConverter(token_list=token_list)
    logging.info(f"Text tokenizer: {tokenizer}")

    self.asr_model = asr_model
    self.asr_train_args = asr_train_args
    self.lm_train_args = lm_train_args
    self.converter = converter
    self.tokenizer = tokenizer
    self.beam_search = beam_search
    self.maxlenratio = maxlenratio
    self.minlenratio = minlenratio
    self.device = device
    self.dtype = dtype
    self.nbest = nbest
def test_beam_search_equal(
    model_class, args, ctc_weight, lm_weight, bonus, device, dtype
):
    """Check that ``BeamSearch`` reproduces the legacy ``model.recognize``.

    Both decoders run on the same features with the same weights; token
    sequences must match exactly and scores must agree within ``rtol=1e-6``.
    For float16 the comparison is skipped and only decodability is checked.
    """
    if device == "cuda" and not torch.cuda.is_available():
        pytest.skip("no cuda device is available")
    if device == "cpu" and dtype == "float16":
        pytest.skip("cpu float16 implementation is not available in pytorch yet")

    # seed setting
    torch.manual_seed(123)
    torch.backends.cudnn.deterministic = True
    # https://github.com/pytorch/pytorch/issues/6351
    torch.backends.cudnn.benchmark = False

    torch_dtype = getattr(torch, dtype)
    model, x, ilens, _, _, train_args = prepare(
        model_class, args, mtlalpha=ctc_weight
    )
    model.eval()
    char_list = train_args.char_list

    lm_class = dynamic_import_lm("default", backend="pytorch")
    lm = lm_class(
        len(char_list),
        Namespace(type="lstm", layer=1, unit=2, embed_unit=2, dropout_rate=0.0),
    )
    lm.eval()

    # decoding options shared by the legacy and the new beam search
    decode_args = Namespace(
        beam_size=3,
        penalty=bonus,
        ctc_weight=ctc_weight,
        maxlenratio=0,
        lm_weight=lm_weight,
        minlenratio=0,
        nbest=5,
    )
    feat = x[0, : ilens[0]].numpy()

    # legacy beam search
    with torch.no_grad():
        nbest = model.recognize(feat, decode_args, char_list, lm.model)

    # new beam search
    scorers = model.scorers()
    if lm_weight != 0:
        scorers["lm"] = lm
    scorers["length_bonus"] = LengthBonus(len(char_list))
    weights = {
        "decoder": 1.0 - ctc_weight,
        "ctc": ctc_weight,
        "lm": decode_args.lm_weight,
        "length_bonus": decode_args.penalty,
    }
    model.to(device, dtype=torch_dtype)
    model.eval()
    pre_beam_score_key = None if ctc_weight == 1.0 else "decoder"
    beam = BeamSearch(
        beam_size=decode_args.beam_size,
        vocab_size=len(char_list),
        weights=weights,
        scorers=scorers,
        token_list=train_args.char_list,
        sos=model.sos,
        eos=model.eos,
        pre_beam_score_key=pre_beam_score_key,
    )
    beam.to(device, dtype=torch_dtype)
    beam.eval()
    with torch.no_grad():
        enc = model.encode(torch.as_tensor(feat).to(device, dtype=torch_dtype))
        nbest_bs = beam(
            x=enc,
            maxlenratio=decode_args.maxlenratio,
            minlenratio=decode_args.minlenratio,
        )

    if torch_dtype == torch.float16:
        # skip because results are different. just checking it is decodable
        return

    for expected, hyp in zip(nbest, nbest_bs):
        actual = hyp.asdict()
        assert expected["yseq"] == actual["yseq"]
        numpy.testing.assert_allclose(expected["score"], actual["score"], rtol=1e-6)
def recog_v2(args):
    """Decode with custom models that implements ScorerInterface.

    Every utterance listed under ``"utts"`` in ``args.recog_json`` is encoded
    by the trained model and decoded with beam search (optionally with an
    RNNLM and an n-gram scorer); the n-best results are written as UTF-8 JSON
    to ``args.result_label``.

    Notes:
        The previous backend espnet.asr.pytorch_backend.asr.recog
        only supports E2E and RNNLM

    Args:
        args (namespace): The program arguments.
            See py:func:`espnet.bin.asr_recog.get_parser` for details

    Raises:
        NotImplementedError: If multi-utterance batch decoding, streaming
            mode, a word-level RNNLM or more than one GPU is requested.
    """
    logging.warning("experimental API for custom LMs is selected by --api v2")
    if args.batchsize > 1:
        raise NotImplementedError(
            "multi-utt batch decoding is not implemented")
    if args.streaming_mode is not None:
        raise NotImplementedError("streaming mode is not implemented")
    if args.word_rnnlm:
        raise NotImplementedError("word LM is not implemented")

    set_deterministic_pytorch(args)
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, ASRInterface)
    model.eval()

    load_inputs_and_targets = LoadInputsAndTargets(
        mode="asr",
        load_output=False,
        sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None else args.preprocess_conf,
        preprocess_args={"train": False},
    )

    if args.rnnlm:
        lm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        # NOTE: for a compatibility with less than 0.5.0 version models
        lm_model_module = getattr(lm_args, "model_module", "default")
        lm_class = dynamic_import_lm(lm_model_module, lm_args.backend)
        lm = lm_class(len(train_args.char_list), lm_args)
        torch_load(args.rnnlm, lm)
        lm.eval()
    else:
        lm = None

    if args.ngram_model:
        from espnet.nets.scorers.ngram import NgramFullScorer
        from espnet.nets.scorers.ngram import NgramPartScorer

        if args.ngram_scorer == "full":
            ngram = NgramFullScorer(args.ngram_model, train_args.char_list)
        else:
            ngram = NgramPartScorer(args.ngram_model, train_args.char_list)
    else:
        ngram = None

    scorers = model.scorers()
    # "lm" and "ngram" are registered even when they were not loaded (None).
    scorers["lm"] = lm
    scorers["ngram"] = ngram
    scorers["length_bonus"] = LengthBonus(len(train_args.char_list))
    weights = dict(
        decoder=1.0 - args.ctc_weight,
        ctc=args.ctc_weight,
        lm=args.lm_weight,
        ngram=args.ngram_weight,
        length_bonus=args.penalty,
    )
    beam_search = BeamSearch(
        beam_size=args.beam_size,
        vocab_size=len(train_args.char_list),
        weights=weights,
        scorers=scorers,
        sos=model.sos,
        eos=model.eos,
        token_list=train_args.char_list,
        pre_beam_score_key=None if args.ctc_weight == 1.0 else "full",
    )
    # TODO(karita): make all scorers batchfied
    if args.batchsize == 1:
        non_batch = [
            k for k, v in beam_search.full_scorers.items()
            if not isinstance(v, BatchScorerInterface)
        ]
        if len(non_batch) == 0:
            # Swap the class in place so the already-built object is reused.
            beam_search.__class__ = BatchBeamSearch
            logging.info("BatchBeamSearch implementation is selected.")
        else:
            logging.warning(f"As non-batch scorers {non_batch} are found, "
                            f"fall back to non-batch implementation.")

    if args.ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")
    device = "cuda" if args.ngpu == 1 else "cpu"
    dtype = getattr(torch, args.dtype)
    logging.info(f"Decoding device={device}, dtype={dtype}")
    model.to(device=device, dtype=dtype).eval()
    beam_search.to(device=device, dtype=dtype).eval()

    # read json data
    # Decode explicitly as UTF-8: the results are written back as UTF-8, and
    # the locale default encoding is not guaranteed to match.
    with open(args.recog_json, "r", encoding="utf-8") as f:
        content = f.read()
    # Some environments prepend a non-JSON warning line to the file (e.g.
    # "Warning! You haven't set Python environment yet. ..."). The document
    # is a JSON object, so skip everything before its opening brace rather
    # than matching one exact message and a fixed character count.
    json_start = content.find("{")
    if json_start > 0:
        if content[:json_start].strip():
            logging.warning(
                "ignoring %d characters of non-JSON text at the top of %s",
                json_start, args.recog_json)
        content = content[json_start:]
    js = json.loads(content)["utts"]

    new_js = {}
    num_utts = len(js)
    with torch.no_grad():
        for idx, name in enumerate(js.keys(), 1):
            # The utterance name is passed as an argument, never spliced into
            # the format string, so a "%" in an id cannot break the log call.
            logging.info("(%d/%d) decoding %s", idx, num_utts, name)
            batch = [(name, js[name])]
            feat = load_inputs_and_targets(batch)[0][0]
            enc = model.encode(
                torch.as_tensor(feat).to(device=device, dtype=dtype))
            nbest_hyps = beam_search(x=enc,
                                     maxlenratio=args.maxlenratio,
                                     minlenratio=args.minlenratio)
            # Slicing already clamps to the number of available hypotheses.
            nbest_hyps = [h.asdict() for h in nbest_hyps[:args.nbest]]
            new_js[name] = add_results_to_json(js[name], nbest_hyps,
                                               train_args.char_list)

    with open(args.result_label, "wb") as f:
        f.write(
            json.dumps({
                "utts": new_js
            }, indent=4, ensure_ascii=False, sort_keys=True).encode("utf_8"))