def __init__( self, asr_train_config: Union[Path, str], asr_model_file: Union[Path, str] = None, lm_train_config: Union[Path, str] = None, lm_file: Union[Path, str] = None, token_type: str = None, bpemodel: str = None, device: str = "cpu", maxlenratio: float = 0.0, minlenratio: float = 0.0, batch_size: int = 1, dtype: str = "float32", beam_size: int = 20, ctc_weight: float = 0.5, lm_weight: float = 1.0, penalty: float = 0.0, nbest: int = 1, disable_repetition_detection=False, decoder_text_length_limit=0, encoded_feat_length_limit=0, ): assert check_argument_types() # 1. Build ASR model scorers = {} asr_model, asr_train_args = ASRTask.build_model_from_file( asr_train_config, asr_model_file, device) asr_model.to(dtype=getattr(torch, dtype)).eval() assert isinstance(asr_model.encoder, ContextualBlockTransformerEncoder) or isinstance( asr_model.encoder, ContextualBlockConformerEncoder) decoder = asr_model.decoder ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos) token_list = asr_model.token_list scorers.update( decoder=decoder, ctc=ctc, length_bonus=LengthBonus(len(token_list)), ) # 2. Build Language model if lm_train_config is not None: lm, lm_train_args = LMTask.build_model_from_file( lm_train_config, lm_file, device) scorers["lm"] = lm.lm # 3. Build BeamSearch object weights = dict( decoder=1.0 - ctc_weight, ctc=ctc_weight, lm=lm_weight, length_bonus=penalty, ) assert "encoder_conf" in asr_train_args assert "look_ahead" in asr_train_args.encoder_conf assert "hop_size" in asr_train_args.encoder_conf assert "block_size" in asr_train_args.encoder_conf # look_ahead = asr_train_args.encoder_conf['look_ahead'] # hop_size = asr_train_args.encoder_conf['hop_size'] # block_size = asr_train_args.encoder_conf['block_size'] assert batch_size == 1 beam_search = BatchBeamSearchOnline( beam_size=beam_size, weights=weights, scorers=scorers, sos=asr_model.sos, eos=asr_model.eos, vocab_size=len(token_list), token_list=token_list, pre_beam_score_key=None if ctc_weight == 1.0 else "full", disable_repetition_detection=disable_repetition_detection, decoder_text_length_limit=decoder_text_length_limit, encoded_feat_length_limit=encoded_feat_length_limit, ) non_batch = [ k for k, v in beam_search.full_scorers.items() if not isinstance(v, BatchScorerInterface) ] assert len(non_batch) == 0 # TODO(karita): make all scorers batchfied logging.info("BatchBeamSearchOnline implementation is selected.") beam_search.to(device=device, dtype=getattr(torch, dtype)).eval() for scorer in scorers.values(): if isinstance(scorer, torch.nn.Module): scorer.to(device=device, dtype=getattr(torch, dtype)).eval() logging.info(f"Beam_search: {beam_search}") logging.info(f"Decoding device={device}, dtype={dtype}") # 4. [Optional] Build Text converter: e.g. bpe-sym -> Text if token_type is None: token_type = asr_train_args.token_type if bpemodel is None: bpemodel = asr_train_args.bpemodel if token_type is None: tokenizer = None elif token_type == "bpe": if bpemodel is not None: tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel) else: tokenizer = None else: tokenizer = build_tokenizer(token_type=token_type) converter = TokenIDConverter(token_list=token_list) logging.info(f"Text tokenizer: {tokenizer}") self.asr_model = asr_model self.asr_train_args = asr_train_args self.converter = converter self.tokenizer = tokenizer self.beam_search = beam_search self.maxlenratio = maxlenratio self.minlenratio = minlenratio self.device = device self.dtype = dtype self.nbest = nbest if "n_fft" in asr_train_args.frontend_conf: self.n_fft = asr_train_args.frontend_conf["n_fft"] else: self.n_fft = 512 if "hop_length" in asr_train_args.frontend_conf: self.hop_length = asr_train_args.frontend_conf["hop_length"] else: self.hop_length = 128 if ("win_length" in asr_train_args.frontend_conf and asr_train_args.frontend_conf["win_length"] is not None): self.win_length = asr_train_args.frontend_conf["win_length"] else: self.win_length = self.n_fft self.reset()
def inference( output_dir: str, maxlenratio: float, minlenratio: float, batch_size: int, dtype: str, beam_size: int, ngpu: int, seed: int, ctc_weight: float, lm_weight: float, penalty: float, nbest: int, num_workers: int, log_level: Union[int, str], data_path_and_name_and_type: Sequence[Tuple[str, str, str]], key_file: Optional[str], asr_train_config: str, asr_model_file: str, lm_train_config: Optional[str], lm_file: Optional[str], word_lm_train_config: Optional[str], word_lm_file: Optional[str], blank_symbol: str, token_type: Optional[str], bpemodel: Optional[str], allow_variable_data_keys: bool, ): assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if word_lm_train_config is not None: raise NotImplementedError("Word LM is not implemented") if ngpu > 1: raise NotImplementedError("only single GPU decoding is supported") logging.basicConfig( level=log_level, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", ) if ngpu >= 1: device = "cuda" else: device = "cpu" # 1. Set random-seed set_all_random_seed(seed) # 2. Build ASR model scorers = {} asr_model, asr_train_args = ASRTask.build_model_from_file( asr_train_config, asr_model_file, device) asr_model.eval() decoder = asr_model.decoder ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos) token_list = asr_model.token_list scorers.update( decoder=decoder, ctc=ctc, length_bonus=LengthBonus(len(token_list)), ) # 3. Build Language model if lm_train_config is not None: lm, lm_train_args = LMTask.build_model_from_file( lm_train_config, lm_file, device) scorers["lm"] = lm.lm # 4. Build BeamSearch object weights = dict( decoder=1.0 - ctc_weight, ctc=ctc_weight, lm=lm_weight, length_bonus=penalty, ) beam_search = BeamSearch( beam_size=beam_size, weights=weights, scorers=scorers, sos=asr_model.sos, eos=asr_model.eos, vocab_size=len(token_list), token_list=token_list, ) beam_search.to(device=device, dtype=getattr(torch, dtype)).eval() for scorer in scorers.values(): if isinstance(scorer, torch.nn.Module): scorer.to(device=device, dtype=getattr(torch, dtype)).eval() logging.info(f"Beam_search: {beam_search}") logging.info(f"Decoding device={device}, dtype={dtype}") # 5. Build data-iterator loader = ASRTask.build_streaming_iterator( data_path_and_name_and_type, dtype=dtype, batch_size=batch_size, key_file=key_file, num_workers=num_workers, preprocess_fn=ASRTask.build_preprocess_fn(asr_train_args, False), collate_fn=ASRTask.build_collate_fn(asr_train_args), allow_variable_data_keys=allow_variable_data_keys, inference=True, ) # 6. [Optional] Build Text converter: e.g. bpe-sym -> Text if token_type is None: token_type = asr_train_args.token_type if bpemodel is None: bpemodel = asr_train_args.bpemodel if token_type is None: tokenizer = None elif token_type == "bpe": if bpemodel is not None: tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel) else: tokenizer = None else: tokenizer = build_tokenizer(token_type=token_type) converter = TokenIDConverter(token_list=token_list) logging.info(f"Text tokenizer: {tokenizer}") # 7 .Start for-loop # FIXME(kamo): The output format should be discussed about with DatadirWriter(output_dir) as writer: for keys, batch in loader: assert isinstance(batch, dict), type(batch) assert all(isinstance(s, str) for s in keys), keys _bs = len(next(iter(batch.values()))) assert len(keys) == _bs, f"{len(keys)} != {_bs}" with torch.no_grad(): # a. To device batch = to_device(batch, device) # b. Forward Encoder enc, _ = asr_model.encode(**batch) assert len(enc) == batch_size, len(enc) # c. Passed the encoder result and the beam search nbest_hyps = beam_search(x=enc[0], maxlenratio=maxlenratio, minlenratio=minlenratio) nbest_hyps = nbest_hyps[:nbest] # Only supporting batch_size==1 key = keys[0] for n in range(1, nbest + 1): hyp = nbest_hyps[n - 1] assert isinstance(hyp, Hypothesis), type(hyp) # remove sos/eos and get results token_int = hyp.yseq[1:-1].tolist() # remove blank symbol id, which is assumed to be 0 token_int = list(filter(lambda x: x != 0, token_int)) # Change integer-ids to tokens token = converter.ids2tokens(token_int) # Create a directory: outdir/{n}best_recog ibest_writer = writer[f"{n}best_recog"] # Write the result to each files ibest_writer["token"][key] = " ".join(token) ibest_writer["token_int"][key] = " ".join(map(str, token_int)) ibest_writer["score"][key] = str(hyp.score) if tokenizer is not None: text = tokenizer.tokens2text(token) ibest_writer["text"][key] = text
def __init__( self, asr_train_config: Union[Path, str], asr_model_file: Union[Path, str] = None, lm_train_config: Union[Path, str] = None, lm_file: Union[Path, str] = None, token_type: str = None, bpemodel: str = None, device: str = "cpu", maxlenratio: float = 0.0, minlenratio: float = 0.0, batch_size: int = 1, dtype: str = "float32", beam_size: int = 20, ctc_weight: float = 0.5, lm_weight: float = 1.0, penalty: float = 0.0, nbest: int = 1, streaming: bool = False, ): assert check_argument_types() # 1. Build ASR model scorers = {} asr_model, asr_train_args = ASRTask.build_model_from_file( asr_train_config, asr_model_file, device) asr_model.to(dtype=getattr(torch, dtype)).eval() decoder = asr_model.decoder ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos) token_list = asr_model.token_list scorers.update( decoder=decoder, ctc=ctc, length_bonus=LengthBonus(len(token_list)), ) # 2. Build Language model if lm_train_config is not None: lm, lm_train_args = LMTask.build_model_from_file( lm_train_config, lm_file, device) scorers["lm"] = lm.lm # 3. Build BeamSearch object weights = dict( decoder=1.0 - ctc_weight, ctc=ctc_weight, lm=lm_weight, length_bonus=penalty, ) beam_search = BeamSearch( beam_size=beam_size, weights=weights, scorers=scorers, sos=asr_model.sos, eos=asr_model.eos, vocab_size=len(token_list), token_list=token_list, pre_beam_score_key=None if ctc_weight == 1.0 else "full", ) # TODO(karita): make all scorers batchfied if batch_size == 1: non_batch = [ k for k, v in beam_search.full_scorers.items() if not isinstance(v, BatchScorerInterface) ] if len(non_batch) == 0: if streaming: beam_search.__class__ = BatchBeamSearchOnlineSim beam_search.set_streaming_config(asr_train_config) logging.info( "BatchBeamSearchOnlineSim implementation is selected.") else: beam_search.__class__ = BatchBeamSearch logging.info("BatchBeamSearch implementation is selected.") else: logging.warning(f"As non-batch scorers {non_batch} are found, " f"fall back to non-batch implementation.") beam_search.to(device=device, dtype=getattr(torch, dtype)).eval() for scorer in scorers.values(): if isinstance(scorer, torch.nn.Module): scorer.to(device=device, dtype=getattr(torch, dtype)).eval() logging.info(f"Beam_search: {beam_search}") logging.info(f"Decoding device={device}, dtype={dtype}") # 4. [Optional] Build Text converter: e.g. bpe-sym -> Text if token_type is None: token_type = asr_train_args.token_type if bpemodel is None: bpemodel = asr_train_args.bpemodel if token_type is None: tokenizer = None elif token_type == "bpe": if bpemodel is not None: tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel) else: tokenizer = None else: tokenizer = build_tokenizer(token_type=token_type) converter = TokenIDConverter(token_list=token_list) logging.info(f"Text tokenizer: {tokenizer}") self.asr_model = asr_model self.asr_train_args = asr_train_args self.converter = converter self.tokenizer = tokenizer self.beam_search = beam_search self.maxlenratio = maxlenratio self.minlenratio = minlenratio self.device = device self.dtype = dtype self.nbest = nbest
def calc_perplexity( output_dir: str, batch_size: int, dtype: str, ngpu: int, seed: int, num_workers: int, log_level: Union[int, str], data_path_and_name_and_type: Sequence[Tuple[str, str, str]], key_file: Optional[str], train_config: Optional[str], model_file: Optional[str], log_base: Optional[float], allow_variable_data_keys: bool, ): assert check_argument_types() logging.basicConfig( level=log_level, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", ) if ngpu >= 1: device = "cuda" else: device = "cpu" # 1. Set random-seed set_all_random_seed(seed) # 2. Build LM model, train_args = LMTask.build_model_from_file(train_config, model_file, device) # Wrape model to make model.nll() data-parallel wrapped_model = ForwardAdaptor(model, "nll") wrapped_model.to(dtype=getattr(torch, dtype)).eval() logging.info(f"Model:\n{model}") # 3. Build data-iterator loader = LMTask.build_streaming_iterator( data_path_and_name_and_type, dtype=dtype, batch_size=batch_size, key_file=key_file, num_workers=num_workers, preprocess_fn=LMTask.build_preprocess_fn(train_args, False), collate_fn=LMTask.build_collate_fn(train_args, False), allow_variable_data_keys=allow_variable_data_keys, inference=True, ) # 4. Start for-loop with DatadirWriter(output_dir) as writer: total_nll = 0.0 total_ntokens = 0 for keys, batch in loader: assert isinstance(batch, dict), type(batch) assert all(isinstance(s, str) for s in keys), keys _bs = len(next(iter(batch.values()))) assert len(keys) == _bs, f"{len(keys)} != {_bs}" with torch.no_grad(): batch = to_device(batch, device) if ngpu <= 1: # NOTE(kamo): data_parallel also should work with ngpu=1, # but for debuggability it's better to keep this block. nll, lengths = wrapped_model(**batch) else: nll, lengths = data_parallel( wrapped_model, (), range(ngpu), module_kwargs=batch ) assert _bs == len(nll) == len(lengths), (_bs, len(nll), len(lengths)) # nll: (B, L) -> (B,) nll = nll.detach().cpu().numpy().sum(1) # lengths: (B,) lengths = lengths.detach().cpu().numpy() total_nll += nll.sum() total_ntokens += lengths.sum() for key, _nll, ntoken in zip(keys, nll, lengths): if log_base is None: utt_ppl = np.exp(_nll / ntoken) else: utt_ppl = log_base ** (_nll / ntoken / np.log(log_base)) # Write PPL of each utts for debugging or analysis writer["utt2ppl"][key] = str(utt_ppl) writer["utt2ntokens"][key] = str(ntoken) if log_base is None: ppl = np.exp(total_nll / total_ntokens) else: ppl = log_base ** (total_nll / total_ntokens / np.log(log_base)) with (Path(output_dir) / "ppl").open("w", encoding="utf-8") as f: f.write(f"{ppl}\n") with (Path(output_dir) / "base").open("w", encoding="utf-8") as f: if log_base is None: _log_base = np.e else: _log_base = log_base f.write(f"{_log_base}\n") logging.info(f"PPL={ppl}")
def __init__( self, asr_train_config: Union[Path, str] = None, asr_model_file: Union[Path, str] = None, transducer_conf: dict = None, lm_train_config: Union[Path, str] = None, lm_file: Union[Path, str] = None, ngram_scorer: str = "full", ngram_file: Union[Path, str] = None, token_type: str = None, bpemodel: str = None, device: str = "cpu", maxlenratio: float = 0.0, minlenratio: float = 0.0, batch_size: int = 1, dtype: str = "float32", beam_size: int = 20, ctc_weight: float = 0.5, lm_weight: float = 1.0, ngram_weight: float = 0.9, penalty: float = 0.0, nbest: int = 1, streaming: bool = False, enh_s2t_task: bool = False, quantize_asr_model: bool = False, quantize_lm: bool = False, quantize_modules: List[str] = ["Linear"], quantize_dtype: str = "qint8", ): assert check_argument_types() task = ASRTask if not enh_s2t_task else EnhS2TTask if quantize_asr_model or quantize_lm: if quantize_dtype == "float16" and torch.__version__ < LooseVersion( "1.5.0"): raise ValueError( "float16 dtype for dynamic quantization is not supported with " "torch version < 1.5.0. Switch to qint8 dtype instead.") quantize_modules = set( [getattr(torch.nn, q) for q in quantize_modules]) quantize_dtype = getattr(torch, quantize_dtype) # 1. Build ASR model scorers = {} asr_model, asr_train_args = task.build_model_from_file( asr_train_config, asr_model_file, device) if enh_s2t_task: asr_model.inherite_attributes(inherite_s2t_attrs=[ "ctc", "decoder", "eos", "joint_network", "sos", "token_list", "use_transducer_decoder", ]) asr_model.to(dtype=getattr(torch, dtype)).eval() if quantize_asr_model: logging.info("Use quantized asr model for decoding.") asr_model = torch.quantization.quantize_dynamic( asr_model, qconfig_spec=quantize_modules, dtype=quantize_dtype) decoder = asr_model.decoder ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos) token_list = asr_model.token_list scorers.update( decoder=decoder, ctc=ctc, length_bonus=LengthBonus(len(token_list)), ) # 2. Build Language model if lm_train_config is not None: lm, lm_train_args = LMTask.build_model_from_file( lm_train_config, lm_file, device) if quantize_lm: logging.info("Use quantized lm for decoding.") lm = torch.quantization.quantize_dynamic( lm, qconfig_spec=quantize_modules, dtype=quantize_dtype) scorers["lm"] = lm.lm # 3. Build ngram model if ngram_file is not None: if ngram_scorer == "full": from espnet.nets.scorers.ngram import NgramFullScorer ngram = NgramFullScorer(ngram_file, token_list) else: from espnet.nets.scorers.ngram import NgramPartScorer ngram = NgramPartScorer(ngram_file, token_list) else: ngram = None scorers["ngram"] = ngram # 4. Build BeamSearch object if asr_model.use_transducer_decoder: beam_search_transducer = BeamSearchTransducer( decoder=asr_model.decoder, joint_network=asr_model.joint_network, beam_size=beam_size, lm=scorers["lm"] if "lm" in scorers else None, lm_weight=lm_weight, **transducer_conf, ) beam_search = None else: beam_search_transducer = None weights = dict( decoder=1.0 - ctc_weight, ctc=ctc_weight, lm=lm_weight, ngram=ngram_weight, length_bonus=penalty, ) beam_search = BeamSearch( beam_size=beam_size, weights=weights, scorers=scorers, sos=asr_model.sos, eos=asr_model.eos, vocab_size=len(token_list), token_list=token_list, pre_beam_score_key=None if ctc_weight == 1.0 else "full", ) # TODO(karita): make all scorers batchfied if batch_size == 1: non_batch = [ k for k, v in beam_search.full_scorers.items() if not isinstance(v, BatchScorerInterface) ] if len(non_batch) == 0: if streaming: beam_search.__class__ = BatchBeamSearchOnlineSim beam_search.set_streaming_config(asr_train_config) logging.info( "BatchBeamSearchOnlineSim implementation is selected." ) else: beam_search.__class__ = BatchBeamSearch logging.info( "BatchBeamSearch implementation is selected.") else: logging.warning( f"As non-batch scorers {non_batch} are found, " f"fall back to non-batch implementation.") beam_search.to(device=device, dtype=getattr(torch, dtype)).eval() for scorer in scorers.values(): if isinstance(scorer, torch.nn.Module): scorer.to(device=device, dtype=getattr(torch, dtype)).eval() logging.info(f"Beam_search: {beam_search}") logging.info(f"Decoding device={device}, dtype={dtype}") # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text if token_type is None: token_type = asr_train_args.token_type if bpemodel is None: bpemodel = asr_train_args.bpemodel if token_type is None: tokenizer = None elif token_type == "bpe": if bpemodel is not None: tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel) else: tokenizer = None else: tokenizer = build_tokenizer(token_type=token_type) converter = TokenIDConverter(token_list=token_list) logging.info(f"Text tokenizer: {tokenizer}") self.asr_model = asr_model self.asr_train_args = asr_train_args self.converter = converter self.tokenizer = tokenizer self.beam_search = beam_search self.beam_search_transducer = beam_search_transducer self.maxlenratio = maxlenratio self.minlenratio = minlenratio self.device = device self.dtype = dtype self.nbest = nbest
def __init__( self, mt_train_config: Union[Path, str] = None, mt_model_file: Union[Path, str] = None, lm_train_config: Union[Path, str] = None, lm_file: Union[Path, str] = None, ngram_scorer: str = "full", ngram_file: Union[Path, str] = None, token_type: str = None, bpemodel: str = None, device: str = "cpu", maxlenratio: float = 0.0, minlenratio: float = 0.0, batch_size: int = 1, dtype: str = "float32", beam_size: int = 20, lm_weight: float = 1.0, ngram_weight: float = 0.9, penalty: float = 0.0, nbest: int = 1, ): assert check_argument_types() # 1. Build MT model scorers = {} mt_model, mt_train_args = MTTask.build_model_from_file( mt_train_config, mt_model_file, device) mt_model.to(dtype=getattr(torch, dtype)).eval() decoder = mt_model.decoder token_list = mt_model.token_list scorers.update( decoder=decoder, length_bonus=LengthBonus(len(token_list)), ) # 2. Build Language model if lm_train_config is not None: lm, lm_train_args = LMTask.build_model_from_file( lm_train_config, lm_file, device) scorers["lm"] = lm.lm # 3. Build ngram model if ngram_file is not None: if ngram_scorer == "full": from espnet.nets.scorers.ngram import NgramFullScorer ngram = NgramFullScorer(ngram_file, token_list) else: from espnet.nets.scorers.ngram import NgramPartScorer ngram = NgramPartScorer(ngram_file, token_list) else: ngram = None scorers["ngram"] = ngram # 4. Build BeamSearch object weights = dict( decoder=1.0, lm=lm_weight, ngram=ngram_weight, length_bonus=penalty, ) beam_search = BeamSearch( beam_size=beam_size, weights=weights, scorers=scorers, sos=mt_model.sos, eos=mt_model.eos, vocab_size=len(token_list), token_list=token_list, pre_beam_score_key="full", ) # TODO(karita): make all scorers batchfied if batch_size == 1: non_batch = [ k for k, v in beam_search.full_scorers.items() if not isinstance(v, BatchScorerInterface) ] if len(non_batch) == 0: beam_search.__class__ = BatchBeamSearch logging.info("BatchBeamSearch implementation is selected.") else: logging.warning(f"As non-batch scorers {non_batch} are found, " f"fall back to non-batch implementation.") beam_search.to(device=device, dtype=getattr(torch, dtype)).eval() for scorer in scorers.values(): if isinstance(scorer, torch.nn.Module): scorer.to(device=device, dtype=getattr(torch, dtype)).eval() logging.info(f"Beam_search: {beam_search}") logging.info(f"Decoding device={device}, dtype={dtype}") # 4. [Optional] Build Text converter: e.g. bpe-sym -> Text if token_type is None: token_type = mt_train_args.token_type if bpemodel is None: bpemodel = mt_train_args.bpemodel if token_type is None: tokenizer = None elif token_type == "bpe": if bpemodel is not None: tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel) else: tokenizer = None else: tokenizer = build_tokenizer(token_type=token_type) converter = TokenIDConverter(token_list=token_list) logging.info(f"Text tokenizer: {tokenizer}") self.mt_model = mt_model self.mt_train_args = mt_train_args self.converter = converter self.tokenizer = tokenizer self.beam_search = beam_search self.maxlenratio = maxlenratio self.minlenratio = minlenratio self.device = device self.dtype = dtype self.nbest = nbest
def __init__( self, asr_train_config: Union[Path, str], asr_model_file: Union[Path, str] = None, lm_train_config: Union[Path, str] = None, lm_file: Union[Path, str] = None, token_type: str = None, bpemodel: str = None, device: str = "cpu", maxlenratio: float = 0.0, minlenratio: float = 0.0, batch_size: int = 1, dtype: str = "float32", beam_size: int = 8, ctc_weight: float = 0.5, lm_weight: float = 1.0, penalty: float = 0.0, nbest: int = 1, streaming: bool = False, search_beam_size: int = 20, output_beam_size: int = 20, min_active_states: int = 30, max_active_states: int = 10000, blank_bias: float = 0.0, lattice_weight: float = 1.0, is_ctc_decoding: bool = True, lang_dir: Optional[str] = None, use_fgram_rescoring: bool = False, use_nbest_rescoring: bool = False, am_weight: float = 1.0, decoder_weight: float = 0.5, nnlm_weight: float = 1.0, num_paths: int = 1000, nbest_batch_size: int = 500, nll_batch_size: int = 100, ): assert check_argument_types() # 1. Build ASR model asr_model, asr_train_args = ASRTask.build_model_from_file( asr_train_config, asr_model_file, device ) asr_model.to(dtype=getattr(torch, dtype)).eval() token_list = asr_model.token_list # 2. Build Language model if lm_train_config is not None: lm, lm_train_args = LMTask.build_model_from_file( lm_train_config, lm_file, device ) self.lm = lm self.is_ctc_decoding = is_ctc_decoding self.use_fgram_rescoring = use_fgram_rescoring self.use_nbest_rescoring = use_nbest_rescoring assert self.is_ctc_decoding, "Currently, only ctc_decoding graph is supported." if self.is_ctc_decoding: self.decode_graph = k2.arc_sort( build_ctc_topo(list(range(len(token_list)))) ) self.decode_graph = self.decode_graph.to(device) if token_type is None: token_type = asr_train_args.token_type if bpemodel is None: bpemodel = asr_train_args.bpemodel if token_type is None: tokenizer = None elif token_type == "bpe": if bpemodel is not None: tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel) else: tokenizer = None else: tokenizer = build_tokenizer(token_type=token_type) converter = TokenIDConverter(token_list=token_list) logging.info(f"Text tokenizer: {tokenizer}") logging.info(f"Running on : {device}") self.asr_model = asr_model self.asr_train_args = asr_train_args self.converter = converter self.tokenizer = tokenizer self.device = device self.dtype = dtype self.search_beam_size = search_beam_size self.output_beam_size = output_beam_size self.min_active_states = min_active_states self.max_active_states = max_active_states self.blank_bias = blank_bias self.lattice_weight = lattice_weight self.am_weight = am_weight self.decoder_weight = decoder_weight self.nnlm_weight = nnlm_weight self.num_paths = num_paths self.nbest_batch_size = nbest_batch_size self.nll_batch_size = nll_batch_size
def __init__( self, asr_train_config: Union[Path, str], asr_model_file: Union[Path, str] = None, lm_train_config: Union[Path, str] = None, lm_file: Union[Path, str] = None, token_type: str = None, bpemodel: str = None, device: str = "cpu", maxlenratio: float = 0.0, minlenratio: float = 0.0, dtype: str = "float32", beam_size: int = 20, ctc_weight: float = 0.5, lm_weight: float = 1.0, penalty: float = 0.0, nbest: int = 1, ): assert check_argument_types() # 1. Build ASR model scorers = {} asr_model, asr_train_args = ASRTask.build_model_from_file( asr_train_config, asr_model_file, device ) asr_model.eval() decoder = asr_model.decoder ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos) token_list = asr_model.token_list scorers.update( decoder=decoder, ctc=ctc, length_bonus=LengthBonus(len(token_list)), ) # 2. Build Language model if lm_train_config is not None: lm, lm_train_args = LMTask.build_model_from_file( lm_train_config, lm_file, device ) scorers["lm"] = lm.lm # 3. Build BeamSearch object weights = dict( decoder=1.0 - ctc_weight, ctc=ctc_weight, lm=lm_weight, length_bonus=penalty, ) beam_search = BeamSearch( beam_size=beam_size, weights=weights, scorers=scorers, sos=asr_model.sos, eos=asr_model.eos, vocab_size=len(token_list), token_list=token_list, ) beam_search.to(device=device, dtype=getattr(torch, dtype)).eval() for scorer in scorers.values(): if isinstance(scorer, torch.nn.Module): scorer.to(device=device, dtype=getattr(torch, dtype)).eval() logging.info(f"Beam_search: {beam_search}") logging.info(f"Decoding device={device}, dtype={dtype}") # 4. [Optional] Build Text converter: e.g. bpe-sym -> Text if token_type is None: token_type = asr_train_args.token_type if bpemodel is None: bpemodel = asr_train_args.bpemodel if token_type is None: tokenizer = None elif token_type == "bpe": if bpemodel is not None: tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel) else: tokenizer = None else: tokenizer = build_tokenizer(token_type=token_type) converter = TokenIDConverter(token_list=token_list) logging.info(f"Text tokenizer: {tokenizer}") self.asr_model = asr_model self.asr_train_args = asr_train_args self.lm_train_args = lm_train_args self.converter = converter self.tokenizer = tokenizer self.beam_search = beam_search self.maxlenratio = maxlenratio self.minlenratio = minlenratio self.device = device self.dtype = dtype self.nbest = nbest
def __init__( self, asr_train_config: Union[Path, str] = None, asr_model_file: Union[Path, str] = None, beam_search_config: Dict[str, Any] = None, lm_train_config: Union[Path, str] = None, lm_file: Union[Path, str] = None, token_type: str = None, bpemodel: str = None, device: str = "cpu", beam_size: int = 5, dtype: str = "float32", lm_weight: float = 1.0, quantize_asr_model: bool = False, quantize_modules: List[str] = None, quantize_dtype: str = "qint8", nbest: int = 1, streaming: bool = False, chunk_size: int = 16, left_context: int = 32, right_context: int = 0, display_partial_hypotheses: bool = False, ) -> None: assert check_argument_types() asr_model, asr_train_args = ASRTransducerTask.build_model_from_file( asr_train_config, asr_model_file, device) if quantize_asr_model: if quantize_modules is not None: if not all([q in ["LSTM", "Linear"] for q in quantize_modules]): raise ValueError( "Only 'Linear' and 'LSTM' modules are currently supported" " by PyTorch and in --quantize_modules") q_config = set( [getattr(torch.nn, q) for q in quantize_modules]) else: q_config = {torch.nn.Linear} if quantize_dtype == "float16" and (V(torch.__version__) < V("1.5.0")): raise ValueError( "float16 dtype for dynamic quantization is not supported with torch" " version < 1.5.0. Switching to qint8 dtype instead.") q_dtype = getattr(torch, quantize_dtype) asr_model = torch.quantization.quantize_dynamic( asr_model, q_config, dtype=q_dtype).eval() else: asr_model.to(dtype=getattr(torch, dtype)).eval() if lm_train_config is not None: lm, lm_train_args = LMTask.build_model_from_file( lm_train_config, lm_file, device) lm_scorer = lm.lm else: lm_scorer = None # 4. Build BeamSearch object if beam_search_config is None: beam_search_config = {} beam_search = BeamSearchTransducer( asr_model.decoder, asr_model.joint_network, beam_size, lm=lm_scorer, lm_weight=lm_weight, nbest=nbest, **beam_search_config, ) token_list = asr_model.token_list if token_type is None: token_type = asr_train_args.token_type if bpemodel is None: bpemodel = asr_train_args.bpemodel if token_type is None: tokenizer = None elif token_type == "bpe": if bpemodel is not None: tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel) else: tokenizer = None else: tokenizer = build_tokenizer(token_type=token_type) converter = TokenIDConverter(token_list=token_list) logging.info(f"Text tokenizer: {tokenizer}") self.asr_model = asr_model self.asr_train_args = asr_train_args self.device = device self.dtype = dtype self.nbest = nbest self.converter = converter self.tokenizer = tokenizer self.beam_search = beam_search self.streaming = streaming self.chunk_size = max(chunk_size, 0) self.left_context = max(left_context, 0) self.right_context = max(right_context, 0) if not streaming or chunk_size == 0: self.streaming = False self.asr_model.encoder.dynamic_chunk_training = False self.n_fft = asr_train_args.frontend_conf.get("n_fft", 512) self.hop_length = asr_train_args.frontend_conf.get("hop_length", 128) if asr_train_args.frontend_conf.get("win_length", None) is not None: self.frontend_window_size = asr_train_args.frontend_conf[ "win_length"] else: self.frontend_window_size = self.n_fft self.window_size = self.chunk_size + self.right_context self._raw_ctx = self.asr_model.encoder.get_encoder_input_raw_size( self.window_size, self.hop_length) self.last_chunk_length = (self.asr_model.encoder.embed.min_frame_length + self.right_context + 1) * self.hop_length self.reset_inference_cache()