def inference( output_dir: str, maxlenratio: float, minlenratio: float, batch_size: int, dtype: str, beam_size: int, ngpu: int, seed: int, ctc_weight: float, lm_weight: float, penalty: float, nbest: int, num_workers: int, log_level: Union[int, str], data_path_and_name_and_type: Sequence[Tuple[str, str, str]], key_file: Optional[str], asr_train_config: str, asr_model_file: str, lm_train_config: Optional[str], lm_file: Optional[str], word_lm_train_config: Optional[str], word_lm_file: Optional[str], blank_symbol: str, token_type: Optional[str], bpemodel: Optional[str], allow_variable_data_keys: bool, ): assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if word_lm_train_config is not None: raise NotImplementedError("Word LM is not implemented") if ngpu > 1: raise NotImplementedError("only single GPU decoding is supported") logging.basicConfig( level=log_level, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", ) if ngpu >= 1: device = "cuda" else: device = "cpu" # 1. Set random-seed set_all_random_seed(seed) # 2. Build ASR model scorers = {} asr_model, asr_train_args = ASRTask.build_model_from_file( asr_train_config, asr_model_file, device) asr_model.eval() decoder = asr_model.decoder ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos) token_list = asr_model.token_list scorers.update( decoder=decoder, ctc=ctc, length_bonus=LengthBonus(len(token_list)), ) # 3. Build Language model if lm_train_config is not None: lm, lm_train_args = LMTask.build_model_from_file( lm_train_config, lm_file, device) scorers["lm"] = lm.lm # 4. Build BeamSearch object weights = dict( decoder=1.0 - ctc_weight, ctc=ctc_weight, lm=lm_weight, length_bonus=penalty, ) beam_search = BeamSearch( beam_size=beam_size, weights=weights, scorers=scorers, sos=asr_model.sos, eos=asr_model.eos, vocab_size=len(token_list), token_list=token_list, ) beam_search.to(device=device, dtype=getattr(torch, dtype)).eval() for scorer in scorers.values(): if isinstance(scorer, torch.nn.Module): scorer.to(device=device, dtype=getattr(torch, dtype)).eval() logging.info(f"Beam_search: {beam_search}") logging.info(f"Decoding device={device}, dtype={dtype}") # 5. Build data-iterator loader = ASRTask.build_streaming_iterator( data_path_and_name_and_type, dtype=dtype, batch_size=batch_size, key_file=key_file, num_workers=num_workers, preprocess_fn=ASRTask.build_preprocess_fn(asr_train_args, False), collate_fn=ASRTask.build_collate_fn(asr_train_args), allow_variable_data_keys=allow_variable_data_keys, inference=True, ) # 6. [Optional] Build Text converter: e.g. bpe-sym -> Text if token_type is None: token_type = asr_train_args.token_type if bpemodel is None: bpemodel = asr_train_args.bpemodel if token_type is None: tokenizer = None elif token_type == "bpe": if bpemodel is not None: tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel) else: tokenizer = None else: tokenizer = build_tokenizer(token_type=token_type) converter = TokenIDConverter(token_list=token_list) logging.info(f"Text tokenizer: {tokenizer}") # 7 .Start for-loop # FIXME(kamo): The output format should be discussed about with DatadirWriter(output_dir) as writer: for keys, batch in loader: assert isinstance(batch, dict), type(batch) assert all(isinstance(s, str) for s in keys), keys _bs = len(next(iter(batch.values()))) assert len(keys) == _bs, f"{len(keys)} != {_bs}" with torch.no_grad(): # a. To device batch = to_device(batch, device) # b. Forward Encoder enc, _ = asr_model.encode(**batch) assert len(enc) == batch_size, len(enc) # c. Passed the encoder result and the beam search nbest_hyps = beam_search(x=enc[0], maxlenratio=maxlenratio, minlenratio=minlenratio) nbest_hyps = nbest_hyps[:nbest] # Only supporting batch_size==1 key = keys[0] for n in range(1, nbest + 1): hyp = nbest_hyps[n - 1] assert isinstance(hyp, Hypothesis), type(hyp) # remove sos/eos and get results token_int = hyp.yseq[1:-1].tolist() # remove blank symbol id, which is assumed to be 0 token_int = list(filter(lambda x: x != 0, token_int)) # Change integer-ids to tokens token = converter.ids2tokens(token_int) # Create a directory: outdir/{n}best_recog ibest_writer = writer[f"{n}best_recog"] # Write the result to each files ibest_writer["token"][key] = " ".join(token) ibest_writer["token_int"][key] = " ".join(map(str, token_int)) ibest_writer["score"][key] = str(hyp.score) if tokenizer is not None: text = tokenizer.tokens2text(token) ibest_writer["text"][key] = text
def __init__( self, asr_train_config: Union[Path, str], asr_model_file: Union[Path, str] = None, fs: int = 16000, ngpu: int = 0, batch_size: int = 1, dtype: str = "float32", kaldi_style_text: bool = True, text_converter: str = "tokenize", time_stamps: str = "auto", **ctc_segmentation_args, ): """Initialize the CTCSegmentation module. Args: asr_train_config: ASR model config file (yaml). asr_model_file: ASR model file (pth). fs: Sample rate of audio file. ngpu: Number of GPUs. Set 0 for processing on CPU, set to 1 for processing on GPU. Multi-GPU aligning is currently not implemented. Default: 0. batch_size: Currently, only batch size == 1 is implemented. dtype: Data type used for inference. Set dtype according to the ASR model. kaldi_style_text: A kaldi-style text file includes the name of the utterance at the start of the line. If True, the utterance name is expected as first word at each line. If False, utterance names are automatically generated. Set this option according to your input data. Default: True. text_converter: How CTC segmentation handles text. "tokenize": Use ESPnet 2 preprocessing to tokenize the text. "classic": The text is preprocessed as in ESPnet 1 which takes token length into account. If the ASR model has longer tokens, this option may yield better results. Default: "tokenize". time_stamps: Choose the method how the time stamps are calculated. While "fixed" and "auto" use both the sample rate, the ratio of samples to one frame is either automatically determined for each inference or fixed at a certain ratio that is initially determined by the module, but can be changed via the parameter ``samples_to_frames_ratio``. Recommended for longer audio files: "auto". **ctc_segmentation_args: Parameters for CTC segmentation. """ assert check_argument_types() # Basic settings if batch_size > 1: raise NotImplementedError("Batch decoding is not implemented") device = "cpu" if ngpu == 1: device = "cuda" elif ngpu > 1: logging.error("Multi-GPU not yet implemented.") raise NotImplementedError("Only single GPU decoding is supported") # Prepare ASR model asr_model, asr_train_args = ASRTask.build_model_from_file( asr_train_config, asr_model_file, device) asr_model.to(dtype=getattr(torch, dtype)).eval() self.preprocess_fn = ASRTask.build_preprocess_fn(asr_train_args, False) # Warn for nets with high memory consumption on long audio files if hasattr(asr_model, "encoder"): encoder_module = asr_model.encoder.__class__.__module__ else: encoder_module = "Unknown" logging.info(f"Encoder module: {encoder_module}") logging.info(f"CTC module: {asr_model.ctc.__class__.__module__}") if "rnn" not in encoder_module.lower(): logging.warning( "No RNN model detected; memory consumption may be high.") self.asr_model = asr_model self.asr_train_args = asr_train_args self.device = device self.dtype = dtype self.ctc = asr_model.ctc self.kaldi_style_text = kaldi_style_text self.token_list = asr_model.token_list # Apply configuration self.set_config( fs=fs, time_stamps=time_stamps, kaldi_style_text=kaldi_style_text, text_converter=text_converter, **ctc_segmentation_args, ) # last token "<sos/eos>", not needed self.config.char_list = asr_model.token_list[:-1]
def inference( output_dir: str, maxlenratio: float, minlenratio: float, batch_size: int, dtype: str, beam_size: int, ngpu: int, seed: int, ctc_weight: float, lm_weight: float, penalty: float, nbest: int, num_workers: int, log_level: Union[int, str], data_path_and_name_and_type: Sequence[Tuple[str, str, str]], key_file: Optional[str], asr_train_config: str, asr_model_file: str, lm_train_config: Optional[str], lm_file: Optional[str], word_lm_train_config: Optional[str], word_lm_file: Optional[str], token_type: Optional[str], bpemodel: Optional[str], allow_variable_data_keys: bool, sim_chunk_length: int, disable_repetition_detection: bool, encoded_feat_length_limit: int, decoder_text_length_limit: int, ): assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if word_lm_train_config is not None: raise NotImplementedError("Word LM is not implemented") if ngpu > 1: raise NotImplementedError("only single GPU decoding is supported") logging.basicConfig( level=log_level, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", ) if ngpu >= 1: device = "cuda" else: device = "cpu" # 1. Set random-seed set_all_random_seed(seed) # 2. Build speech2text speech2text = Speech2TextStreaming( asr_train_config=asr_train_config, asr_model_file=asr_model_file, lm_train_config=lm_train_config, lm_file=lm_file, token_type=token_type, bpemodel=bpemodel, device=device, maxlenratio=maxlenratio, minlenratio=minlenratio, dtype=dtype, beam_size=beam_size, ctc_weight=ctc_weight, lm_weight=lm_weight, penalty=penalty, nbest=nbest, disable_repetition_detection=disable_repetition_detection, decoder_text_length_limit=decoder_text_length_limit, encoded_feat_length_limit=encoded_feat_length_limit, ) # 3. Build data-iterator loader = ASRTask.build_streaming_iterator( data_path_and_name_and_type, dtype=dtype, batch_size=batch_size, key_file=key_file, num_workers=num_workers, preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False), collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False), allow_variable_data_keys=allow_variable_data_keys, inference=True, ) # 7 .Start for-loop # FIXME(kamo): The output format should be discussed about with DatadirWriter(output_dir) as writer: for keys, batch in loader: assert isinstance(batch, dict), type(batch) assert all(isinstance(s, str) for s in keys), keys _bs = len(next(iter(batch.values()))) assert len(keys) == _bs, f"{len(keys)} != {_bs}" batch = { k: v[0] for k, v in batch.items() if not k.endswith("_lengths") } assert len(batch.keys()) == 1 try: if sim_chunk_length == 0: # N-best list of (text, token, token_int, hyp_object) results = speech2text(**batch) else: speech = batch["speech"] for i in range(len(speech) // sim_chunk_length): speech2text( speech=speech[i * sim_chunk_length:(i + 1) * sim_chunk_length], is_final=False, ) results = speech2text(speech[(i + 1) * sim_chunk_length:len(speech)], is_final=True) except TooShortUttError as e: logging.warning(f"Utterance {keys} {e}") hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) results = [[" ", ["<space>"], [2], hyp]] * nbest # Only supporting batch_size==1 key = keys[0] for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results): # Create a directory: outdir/{n}best_recog ibest_writer = writer[f"{n}best_recog"] # Write the result to each file ibest_writer["token"][key] = " ".join(token) ibest_writer["token_int"][key] = " ".join(map(str, token_int)) ibest_writer["score"][key] = str(hyp.score) if text is not None: ibest_writer["text"][key] = text
def inference( output_dir: str, batch_size: int, dtype: str, ngpu: int, seed: int, num_workers: int, log_level: Union[int, str], data_path_and_name_and_type: Sequence[Tuple[str, str, str]], key_file: Optional[str], asr_train_config: str, asr_model_file: str, model_tag: Optional[str], token_type: Optional[str], bpemodel: Optional[str], allow_variable_data_keys: bool, maskctc_n_iterations: int, maskctc_threshold_probability: float, ): assert check_argument_types() if batch_size > 1: raise NotImplementedError("batch decoding is not implemented") if ngpu > 1: raise NotImplementedError("only single GPU decoding is supported") logging.basicConfig( level=log_level, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", ) if ngpu >= 1: device = "cuda" else: device = "cpu" # 1. Set random-seed set_all_random_seed(seed) # 2. Build speech2text speech2text_kwargs = dict( asr_train_config=asr_train_config, asr_model_file=asr_model_file, token_type=token_type, bpemodel=bpemodel, device=device, batch_size=batch_size, dtype=dtype, maskctc_n_iterations=maskctc_n_iterations, maskctc_threshold_probability=maskctc_threshold_probability, ) speech2text = Speech2Text.from_pretrained( model_tag=model_tag, **speech2text_kwargs, ) # 3. Build data-iterator loader = ASRTask.build_streaming_iterator( data_path_and_name_and_type, dtype=dtype, batch_size=batch_size, key_file=key_file, num_workers=num_workers, preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False), collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False), allow_variable_data_keys=allow_variable_data_keys, inference=True, ) # 7 .Start for-loop with DatadirWriter(output_dir) as writer: for keys, batch in loader: assert isinstance(batch, dict), type(batch) assert all(isinstance(s, str) for s in keys), keys _bs = len(next(iter(batch.values()))) assert len(keys) == _bs, f"{len(keys)} != {_bs}" batch = { k: v[0] for k, v in batch.items() if not k.endswith("_lengths") } try: results = speech2text(**batch) except TooShortUttError as e: logging.warning(f"Utterance {keys} {e}") hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) results = [[" ", ["<space>"], [2], hyp]] # Only supporting batch_size==1 key = keys[0] (text, token, token_int, hyp) = results[0] # Create a directory: outdir/{n}best_recog ibest_writer = writer["1best_recog"] # Write the result to each file ibest_writer["token"][key] = " ".join(token) ibest_writer["token_int"][key] = " ".join(map(str, token_int)) ibest_writer["score"][key] = str(hyp.score) if text is not None: ibest_writer["text"][key] = text
def inference( output_dir: str, maxlenratio: float, minlenratio: float, batch_size: int, dtype: str, beam_size: int, ngpu: int, seed: int, ctc_weight: float, lm_weight: float, penalty: float, nbest: int, num_workers: int, log_level: Union[int, str], data_path_and_name_and_type: Sequence[Tuple[str, str, str]], key_file: Optional[str], asr_train_config: str, asr_model_file: str, lm_train_config: Optional[str], lm_file: Optional[str], word_lm_train_config: Optional[str], word_lm_file: Optional[str], token_type: Optional[str], bpemodel: Optional[str], allow_variable_data_keys: bool, streaming: bool, ): assert check_argument_types() if ngpu > 1: raise NotImplementedError("only single GPU decoding is supported") logging.basicConfig( level=log_level, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", ) if ngpu >= 1: device = "cuda" else: device = "cpu" # 1. Set random-seed set_all_random_seed(seed) # 2. Build speech2text speech2text = k2Speech2Text( asr_train_config=asr_train_config, asr_model_file=asr_model_file, lm_train_config=lm_train_config, lm_file=lm_file, token_type=token_type, bpemodel=bpemodel, device=device, maxlenratio=maxlenratio, minlenratio=minlenratio, dtype=dtype, beam_size=beam_size, ctc_weight=ctc_weight, lm_weight=lm_weight, penalty=penalty, nbest=nbest, streaming=streaming, ) # 3. Build data-iterator loader = ASRTask.build_streaming_iterator( data_path_and_name_and_type, dtype=dtype, batch_size=batch_size, key_file=key_file, num_workers=num_workers, preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False), collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False), allow_variable_data_keys=allow_variable_data_keys, inference=True, ) with DatadirWriter(output_dir) as writer: for batch_idx, (keys, batch) in enumerate(loader): if batch_idx % 10 == 0: logging.info(f"Processing {batch_idx} batch") assert isinstance(batch, dict), type(batch) assert all(isinstance(s, str) for s in keys), keys _bs = len(next(iter(batch.values()))) assert len(keys) == _bs, f"{len(keys)} != {_bs}" # 1-best list of (text, token, token_int) results = speech2text(batch) for key_idx, (text, token, token_int, score) in enumerate(results): key = keys[key_idx] best_writer = writer["1best_recog"] # Write the result to each file best_writer["token"][key] = " ".join(token) best_writer["token_int"][key] = " ".join(map(str, token_int)) best_writer["score"][key] = str(score) if text is not None: best_writer["text"][key] = text