def prepare_data(args):
    """Write ``wav.scp``, ``spk1.scp`` and ``utt2spk`` for a set of wav files.

    Every ``*.wav`` file found recursively under ``args.audiodirs`` becomes
    one utterance. Its ID is ``<parent directory name>_<file stem>``,
    prepended with ``<args.uttid_prefix>_`` when that prefix is non-empty.

    Args:
        args: Namespace providing ``audiodirs`` (iterable of directory
            paths), ``uttid_prefix`` (string, may be empty or None) and
            ``outdir`` (handed to ``DatadirWriter``).
    """
    audiodirs = [Path(audiodir).expanduser().resolve() for audiodir in args.audiodirs]

    # Both ID forms end in "<parent>_<stem>"; only the optional prefix differs.
    # The un-prefixed form previously joined a Path object, which made
    # str.join raise TypeError; the directory *name* is used in both cases now.
    id_head = [args.uttid_prefix] if args.uttid_prefix else []
    audios = {
        "_".join(id_head + [path.parent.stem, path.stem]): str(path)
        for audiodir in audiodirs
        for path in audiodir.rglob("*.wav")
    }

    with DatadirWriter(args.outdir) as writer:
        for uttid, utt_path in audios.items():
            writer["wav.scp"][uttid] = utt_path
            # The same file is registered as the reference signal
            writer["spk1.scp"][uttid] = utt_path
            # Each utterance is treated as its own speaker
            writer["utt2spk"][uttid] = uttid
def test_DatadirWriter(tmp_path: Path):
    """Exercise DatadirWriter's context-manager and item-access protocols."""
    datadir = DatadirWriter(tmp_path)
    # Entering/leaving the context covers __enter__(), __exit__() and close()
    with datadir as root:
        # __getitem__() hands back a child writer
        child = root["aa"]
        # __setitem__() with a str value is accepted
        child["bb"] = "aa"

        # A non-str value must be rejected
        with pytest.raises(TypeError):
            child["bb"] = 1

        # "aa" already has children, so it cannot be assigned a value
        with pytest.raises(RuntimeError):
            root["aa"] = "dd"

        # "aa" is already used as a text file, so it cannot be indexed further
        with pytest.raises(RuntimeError):
            child["cc"]

        # Second directory whose ids do not match the first one
        root["aa2"]["ccccc"] = "aaa"
        # Writing the same id again (original note: "Duplicated warning")
        root["aa2"]["ccccc"] = "def"
def prepare_data(args):
    """Build ``wav.scp``, ``spk1.scp`` and ``utt2spk`` from a mixing config.

    Each non-blank line of ``args.config_file`` holds six whitespace-separated
    fields: clean path, start time, noise path, RIR path, SNR and scale. They
    are combined with ``#`` into an utterance ID that must match the stem of
    a wav file found under ``args.audiodirs``.

    Args:
        args: Namespace providing ``config_file``, ``audiodirs``,
            ``uttid_suffix``, ``outdir`` and ``use_reverb_ref``.
    """
    config_file = Path(args.config_file).expanduser().resolve()

    # Index every wav file by its stem; a later duplicate replaces an earlier one
    audios = {}
    for audiodir in [Path(d).expanduser().resolve() for d in args.audiodirs]:
        for wav in audiodir.rglob("*.wav"):
            audios[wav.stem] = str(wav)

    if args.uttid_suffix:
        suffix = "_" + args.uttid_suffix
    else:
        suffix = ""

    with DatadirWriter(args.outdir) as writer, config_file.open("r") as f:
        for raw in f:
            entry = raw.strip()
            if entry == "":
                continue

            path_clean, start_time, path_noise, path_rir, snr, scale = entry.split()
            id_fields = [
                Path(path_clean).stem,
                Path(path_noise).stem,
                Path(path_rir).stem,
                start_time,
                snr,
                scale,
            ]
            uttid = "#".join(id_fields)
            out_id = uttid + suffix

            writer["wav.scp"][out_id] = audios[uttid]

            # The reference lives beside the mixture, in a sibling directory
            repl = r"/reverb_ref/\1" if args.use_reverb_ref else r"/noreverb_ref/\1"
            ref_path = re.sub(r"/mix/([^\\]+\.wav$)", repl, audios[uttid])
            writer["spk1.scp"][out_id] = ref_path

            # The speaker ID is derived from the clean path's directory names
            clean_parts = path_clean.split("/")
            if "librispeech" in path_clean:
                spkid = "-".join(clean_parts[-3:-1])
            else:
                spkid = clean_parts[-2]
            writer["utt2spk"][out_id] = spkid
def inference(
    output_dir: str,
    maxlenratio: float,
    minlenratio: float,
    batch_size: int,
    dtype: str,
    beam_size: int,
    ngpu: int,
    seed: int,
    ctc_weight: float,
    lm_weight: float,
    penalty: float,
    nbest: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    asr_train_config: str,
    asr_model_file: str,
    lm_train_config: Optional[str],
    lm_file: Optional[str],
    word_lm_train_config: Optional[str],
    word_lm_file: Optional[str],
    blank_symbol: str,
    token_type: Optional[str],
    bpemodel: Optional[str],
    allow_variable_data_keys: bool,
):
    """Decode utterances with beam search and write the n-best results.

    For every utterance, up to ``nbest`` hypotheses are written under
    ``output_dir/{n}best_recog`` as ``token``, ``token_int``, ``score`` and,
    when a tokenizer is available, ``text``.

    ``blank_symbol`` and ``word_lm_file`` are accepted but not used by this
    function; the blank id is assumed to be 0.

    Raises:
        NotImplementedError: If ``batch_size > 1``, ``ngpu > 1`` or
            ``word_lm_train_config`` is given.
    """
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if word_lm_train_config is not None:
        raise NotImplementedError("Word LM is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build ASR model
    scorers = {}
    asr_model, asr_train_args = ASRTask.build_model_from_file(
        asr_train_config, asr_model_file, device
    )
    asr_model.eval()

    decoder = asr_model.decoder
    ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
    token_list = asr_model.token_list
    scorers.update(
        decoder=decoder,
        ctc=ctc,
        length_bonus=LengthBonus(len(token_list)),
    )

    # 3. Build Language model
    if lm_train_config is not None:
        lm, lm_train_args = LMTask.build_model_from_file(
            lm_train_config, lm_file, device
        )
        scorers["lm"] = lm.lm

    # 4. Build BeamSearch object
    weights = dict(
        decoder=1.0 - ctc_weight,
        ctc=ctc_weight,
        lm=lm_weight,
        length_bonus=penalty,
    )
    beam_search = BeamSearch(
        beam_size=beam_size,
        weights=weights,
        scorers=scorers,
        sos=asr_model.sos,
        eos=asr_model.eos,
        vocab_size=len(token_list),
        token_list=token_list,
    )
    beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
    for scorer in scorers.values():
        if isinstance(scorer, torch.nn.Module):
            scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
    logging.info(f"Beam_search: {beam_search}")
    logging.info(f"Decoding device={device}, dtype={dtype}")

    # 5. Build data-iterator
    loader = ASRTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=ASRTask.build_preprocess_fn(asr_train_args, False),
        collate_fn=ASRTask.build_collate_fn(asr_train_args),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 6. [Optional] Build Text converter: e.g. bpe-sym -> Text
    # Fall back to the training configuration when not given explicitly
    if token_type is None:
        token_type = asr_train_args.token_type
    if bpemodel is None:
        bpemodel = asr_train_args.bpemodel

    if token_type is None:
        tokenizer = None
    elif token_type == "bpe":
        if bpemodel is not None:
            tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
        else:
            tokenizer = None
    else:
        tokenizer = build_tokenizer(token_type=token_type)
    converter = TokenIDConverter(token_list=token_list)
    logging.info(f"Text tokenizer: {tokenizer}")

    # 7. Start for-loop
    # FIXME(kamo): The output format should be discussed about
    with DatadirWriter(output_dir) as writer:
        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"

            with torch.no_grad():
                # a. To device
                batch = to_device(batch, device)

                # b. Forward Encoder
                enc, _ = asr_model.encode(**batch)
                assert len(enc) == batch_size, len(enc)

                # c. Pass the encoder result to the beam search
                nbest_hyps = beam_search(
                    x=enc[0], maxlenratio=maxlenratio, minlenratio=minlenratio
                )
                nbest_hyps = nbest_hyps[:nbest]

            # Only supporting batch_size==1
            key = keys[0]
            # Iterate over the hypotheses actually returned: the beam search
            # may yield fewer than ``nbest`` of them, and indexing by
            # ``range(nbest)`` would then raise IndexError.
            for n, hyp in enumerate(nbest_hyps, 1):
                assert isinstance(hyp, Hypothesis), type(hyp)

                # remove sos/eos and get results
                token_int = hyp.yseq[1:-1].tolist()

                # remove blank symbol id, which is assumed to be 0
                token_int = list(filter(lambda x: x != 0, token_int))

                # Change integer-ids to tokens
                token = converter.ids2tokens(token_int)

                # Create a directory: outdir/{n}best_recog
                ibest_writer = writer[f"{n}best_recog"]

                # Write the result to each file
                ibest_writer["token"][key] = " ".join(token)
                ibest_writer["token_int"][key] = " ".join(map(str, token_int))
                ibest_writer["score"][key] = str(hyp.score)

                if tokenizer is not None:
                    text = tokenizer.tokens2text(token)
                    ibest_writer["text"][key] = text
def scoring(
    output_dir: str,
    dtype: str,
    log_level: Union[int, str],
    key_file: str,
    ref_scp: List[str],
    inf_scp: List[str],
    ref_channel: int,
):
    """Score separated/enhanced audio against references and write the results.

    For every key in ``key_file`` the reference and inferred signals of all
    speakers are loaded, the best speaker permutation is obtained from
    ``bss_eval_sources`` and STOI, SI-SNR, SDR, SAR and SIR are written per
    speaker under ``output_dir``, together with the permutation-resolved
    ``wav_spk{i}`` entries.

    Args:
        output_dir: Directory handed to ``DatadirWriter``.
        dtype: Data type passed to ``SoundScpReader``.
        log_level: Logging level.
        key_file: Text file whose first whitespace-separated field on each
            line is an utterance key.
        ref_scp: One scp file per reference speaker.
        inf_scp: One scp file per inferred speaker (same length as ``ref_scp``).
        ref_channel: Index applied to the last axis to select a channel from
            multi-channel signals.

    Raises:
        ValueError: If the inferred audio has more dimensions than the reference.
    """
    assert check_argument_types()

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    assert len(ref_scp) == len(inf_scp), ref_scp
    num_spk = len(ref_scp)

    # Read the keys inside a context manager so the file handle is released
    with open(key_file, encoding="utf-8") as f:
        keys = [line.rstrip().split(maxsplit=1)[0] for line in f]

    ref_readers = [SoundScpReader(f, dtype=dtype, normalize=True) for f in ref_scp]
    inf_readers = [SoundScpReader(f, dtype=dtype, normalize=True) for f in inf_scp]

    # get sample rate
    sample_rate, _ = ref_readers[0][keys[0]]

    # check keys
    for inf_reader, ref_reader in zip(inf_readers, ref_readers):
        assert inf_reader.keys() == ref_reader.keys()

    with DatadirWriter(output_dir) as writer:
        for key in keys:
            ref_audios = [ref_reader[key][1] for ref_reader in ref_readers]
            inf_audios = [inf_reader[key][1] for inf_reader in inf_readers]
            ref = np.array(ref_audios)
            inf = np.array(inf_audios)
            if ref.ndim > inf.ndim:
                # multi-channel reference and single-channel output
                ref = ref[..., ref_channel]
                assert ref.shape == inf.shape, (ref.shape, inf.shape)
            elif ref.ndim < inf.ndim:
                # single-channel reference and multi-channel output
                # (adjacent literals keep source indentation out of the message)
                raise ValueError(
                    "Reference must be multi-channel when the "
                    "network output is multi-channel."
                )
            elif ref.ndim == inf.ndim == 3:
                # multi-channel reference and output
                ref = ref[..., ref_channel]
                inf = inf[..., ref_channel]

            sdr, sir, sar, perm = bss_eval_sources(ref, inf, compute_permutation=True)

            for i in range(num_spk):
                # Index of the inferred stream assigned to reference speaker i
                p = int(perm[i])
                stoi_score = stoi(ref[i], inf[p], fs_sig=sample_rate)
                si_snr_score = -float(
                    si_snr_loss(
                        torch.from_numpy(ref[i][None, ...]),
                        torch.from_numpy(inf[p][None, ...]),
                    )
                )
                writer[f"STOI_spk{i + 1}"][key] = str(stoi_score)
                writer[f"SI_SNR_spk{i + 1}"][key] = str(si_snr_score)
                writer[f"SDR_spk{i + 1}"][key] = str(sdr[i])
                writer[f"SAR_spk{i + 1}"][key] = str(sar[i])
                writer[f"SIR_spk{i + 1}"][key] = str(sir[i])
                # save permutation assigned script file
                writer[f"wav_spk{i + 1}"][key] = inf_readers[p].data[key]
def scoring(
    output_dir: str,
    dtype: str,
    log_level: Union[int, str],
    key_file: str,
    ref_scp: List[str],
    inf_scp: List[str],
    ref_channel: int,
    metrics: List[str],
    frame_size: int = 512,
    frame_hop: int = 256,
):
    """Score separated/enhanced audio with a selectable set of metrics.

    For every key in ``key_file`` the requested ``metrics`` are computed per
    speaker and written under ``output_dir`` as ``{metric}_spk{i}``, together
    with the permutation-resolved ``wav_spk{i}`` entries.

    Args:
        output_dir: Directory handed to ``DatadirWriter``.
        dtype: Data type passed to ``SoundScpReader``.
        log_level: Logging level.
        key_file: Text file whose first whitespace-separated field on each
            line is an utterance key.
        ref_scp: One scp file per reference speaker.
        inf_scp: One scp file per inferred speaker (same length as ``ref_scp``).
        ref_channel: Index applied to the last axis to select a channel from
            multi-channel signals.
        metrics: Any of "STOI", "ESTOI", "SNR", "SI_SNR", "SDR", "SAR", "SIR",
            "framewise-SNR".
        frame_size: ``n_fft`` of the STFT used for "framewise-SNR".
        frame_hop: ``hop_length`` of the STFT used for "framewise-SNR".

    Raises:
        ValueError: If the inferred audio has more dimensions than the reference.
    """
    assert check_argument_types()
    for metric in metrics:
        assert metric in (
            "STOI",
            "ESTOI",
            "SNR",
            "SI_SNR",
            "SDR",
            "SAR",
            "SIR",
            "framewise-SNR",
        ), metric

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    assert len(ref_scp) == len(inf_scp), ref_scp
    num_spk = len(ref_scp)

    # Read the keys inside a context manager so the file handle is released
    with open(key_file, encoding="utf-8") as f:
        keys = [line.rstrip().split(maxsplit=1)[0] for line in f]

    ref_readers = [SoundScpReader(f, dtype=dtype, normalize=True) for f in ref_scp]
    inf_readers = [SoundScpReader(f, dtype=dtype, normalize=True) for f in inf_scp]

    # get sample rate
    fs, _ = ref_readers[0][keys[0]]

    # check keys
    for inf_reader, ref_reader in zip(inf_readers, ref_readers):
        assert inf_reader.keys() == ref_reader.keys()

    stft = STFTEncoder(n_fft=frame_size, hop_length=frame_hop)

    do_bss_eval = "SDR" in metrics or "SAR" in metrics or "SIR" in metrics
    with DatadirWriter(output_dir) as writer:
        for key in keys:
            ref_audios = [ref_reader[key][1] for ref_reader in ref_readers]
            inf_audios = [inf_reader[key][1] for inf_reader in inf_readers]
            ref = np.array(ref_audios)
            inf = np.array(inf_audios)
            if ref.ndim > inf.ndim:
                # multi-channel reference and single-channel output
                ref = ref[..., ref_channel]
                assert ref.shape == inf.shape, (ref.shape, inf.shape)
            elif ref.ndim < inf.ndim:
                # single-channel reference and multi-channel output
                raise ValueError(
                    "Reference must be multi-channel when the "
                    "network output is multi-channel."
                )
            elif ref.ndim == inf.ndim == 3:
                # multi-channel reference and output
                ref = ref[..., ref_channel]
                inf = inf[..., ref_channel]

            # bss_eval is also what resolves the speaker permutation, so it is
            # run whenever there is more than one speaker.
            if do_bss_eval or num_spk > 1:
                sdr, sir, sar, perm = bss_eval_sources(
                    ref, inf, compute_permutation=True
                )
            else:
                perm = [0]

            ilens = torch.LongTensor([ref.shape[1]])
            # (num_spk, T, F)
            ref_spec, _ = stft(torch.from_numpy(ref), ilens)
            inf_spec, _ = stft(torch.from_numpy(inf), ilens)

            for i in range(num_spk):
                # Index of the inferred stream assigned to reference speaker i
                p = int(perm[i])
                for metric in metrics:
                    name = f"{metric}_spk{i + 1}"
                    if metric == "STOI":
                        writer[name][key] = str(
                            stoi(ref[i], inf[p], fs_sig=fs, extended=False)
                        )
                    elif metric == "ESTOI":
                        writer[name][key] = str(
                            stoi(ref[i], inf[p], fs_sig=fs, extended=True)
                        )
                    elif metric == "SNR":
                        snr_score = -float(
                            ESPnetEnhancementModel.snr_loss(
                                torch.from_numpy(ref[i][None, ...]),
                                torch.from_numpy(inf[p][None, ...]),
                            )
                        )
                        writer[name][key] = str(snr_score)
                    elif metric == "SI_SNR":
                        si_snr_score = -float(
                            ESPnetEnhancementModel.si_snr_loss(
                                torch.from_numpy(ref[i][None, ...]),
                                torch.from_numpy(inf[p][None, ...]),
                            )
                        )
                        writer[name][key] = str(si_snr_score)
                    elif metric == "SDR":
                        writer[name][key] = str(sdr[i])
                    elif metric == "SAR":
                        writer[name][key] = str(sar[i])
                    elif metric == "SIR":
                        writer[name][key] = str(sir[i])
                    elif metric == "framewise-SNR":
                        # Use the permuted stream ``p`` like every other
                        # metric; comparing against stream ``i`` would score
                        # the wrong speaker when the permutation is not identity.
                        framewise_snr = -ESPnetEnhancementModel.snr_loss(
                            ref_spec[i].abs(), inf_spec[p].abs()
                        )
                        writer[name][key] = " ".join(map(str, framewise_snr.tolist()))
                    else:
                        raise ValueError("Unsupported metric: %s" % metric)
                # save permutation assigned script file
                writer[f"wav_spk{i + 1}"][key] = inf_readers[p].data[key]
def collect_stats(
    model: AbsESPnetModel,
    train_iter: DataLoader and Iterable[Tuple[List[str], Dict[str, torch.Tensor]]],
    valid_iter: DataLoader and Iterable[Tuple[List[str], Dict[str, torch.Tensor]]],
    output_dir: Path,
    ngpu: Optional[int],
    log_interval: Optional[int],
    write_collected_feats: bool,
) -> None:
    """Perform on collect_stats mode.

    Running for deriving the shape information from data
    and gathering statistics.
    This method is used before executing train().

    For each of the "train" and "valid" iterators this writes, under
    ``output_dir/<mode>``: one ``<name>_shape`` file per batch entry, one
    ``<key>_stats.npz`` (count, sum, sum of squares) per collected feature,
    and the ``batch_keys`` / ``stats_keys`` listings. With
    ``write_collected_feats`` the features themselves are also stored via
    ``NpyScpWriter``.

    Args:
        model: Model providing ``collect_feats``.
        train_iter: Iterable of ``(keys, batch)`` pairs for training data.
        valid_iter: Iterable of ``(keys, batch)`` pairs for validation data.
        output_dir: Root directory of all outputs.
        ngpu: Number of GPUs; ``None`` is treated as 0.
        log_interval: Log every this many iterations. When ``None`` it is
            derived from the length of the first iterator that is processed.
        write_collected_feats: Whether to dump the collected features.
    """
    assert check_argument_types()

    # ``ngpu`` is annotated Optional: comparing None with an int would raise
    # TypeError, so normalise it once.
    if ngpu is None:
        ngpu = 0
    device = "cuda" if ngpu > 0 else "cpu"

    npy_scp_writers = {}
    for itr, mode in zip([train_iter, valid_iter], ["train", "valid"]):
        if log_interval is None:
            try:
                log_interval = max(len(itr) // 20, 10)
            except TypeError:
                # The iterator has no length
                log_interval = 100

        sum_dict = defaultdict(lambda: 0)
        sq_dict = defaultdict(lambda: 0)
        count_dict = defaultdict(lambda: 0)

        # Reset per mode so an empty iterator neither leaves ``batch`` unbound
        # nor reuses the previous mode's batch when batch_keys is written.
        batch = {}
        num_batches = 0

        with DatadirWriter(output_dir / mode) as datadir_writer:
            for iiter, (keys, batch) in enumerate(itr, 1):
                num_batches = iiter
                batch = to_device(batch, device)

                # 1. Write shape file
                for name in batch:
                    if name.endswith("_lengths"):
                        continue
                    for i, (key, data) in enumerate(zip(keys, batch[name])):
                        if f"{name}_lengths" in batch:
                            lg = int(batch[f"{name}_lengths"][i])
                            data = data[:lg]
                        datadir_writer[f"{name}_shape"][key] = ",".join(
                            map(str, data.shape)
                        )

                # 2. Extract feats
                if ngpu <= 1:
                    data = model.collect_feats(**batch)
                else:
                    # Note that data_parallel can parallelize only "forward()"
                    data = data_parallel(
                        ForwardAdaptor(model, "collect_feats"),
                        (),
                        range(ngpu),
                        module_kwargs=batch,
                    )

                # 3. Calculate sum and square sum
                for key, v in data.items():
                    for i, (uttid, seq) in enumerate(zip(keys, v.cpu().numpy())):
                        # Truncate zero-padding region
                        if f"{key}_lengths" in data:
                            length = data[f"{key}_lengths"][i]
                            # seq: (Length, Dim, ...)
                            seq = seq[:length]
                        else:
                            # seq: (Dim, ...) -> (1, Dim, ...)
                            seq = seq[None]
                        # Accumulate value, its square, and count
                        sum_dict[key] += seq.sum(0)
                        sq_dict[key] += (seq ** 2).sum(0)
                        count_dict[key] += len(seq)

                        # 4. [Option] Write derived features as npy format file.
                        if write_collected_feats:
                            # Instantiate NpyScpWriter for the first iteration
                            if (key, mode) not in npy_scp_writers:
                                p = output_dir / mode / "collect_feats"
                                npy_scp_writers[(key, mode)] = NpyScpWriter(
                                    p / f"data_{key}", p / f"{key}.scp"
                                )
                            # Save array as npy file
                            npy_scp_writers[(key, mode)][uttid] = seq

                if iiter % log_interval == 0:
                    logging.info(f"Niter: {iiter}")

        if num_batches == 0:
            logging.warning(f"No batch was found for mode={mode}")
            # Nothing has been written yet, so the directory may not exist
            (output_dir / mode).mkdir(parents=True, exist_ok=True)

        for key in sum_dict:
            np.savez(
                output_dir / mode / f"{key}_stats.npz",
                count=count_dict[key],
                sum=sum_dict[key],
                sum_square=sq_dict[key],
            )

        # batch_keys and stats_keys are used by aggregate_stats_dirs.py
        with (output_dir / mode / "batch_keys").open("w", encoding="utf-8") as f:
            f.write(
                "\n".join(filter(lambda x: not x.endswith("_lengths"), batch)) + "\n"
            )
        with (output_dir / mode / "stats_keys").open("w", encoding="utf-8") as f:
            f.write("\n".join(sum_dict) + "\n")
def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    ngpu: int,
    seed: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    asr_train_config: str,
    asr_model_file: str,
    model_tag: Optional[str],
    token_type: Optional[str],
    bpemodel: Optional[str],
    allow_variable_data_keys: bool,
    maskctc_n_iterations: int,
    maskctc_threshold_probability: float,
):
    """Decode every utterance with ``Speech2Text`` and write the 1-best result.

    Results go to ``output_dir/1best_recog`` as ``token``, ``token_int``,
    ``score`` and, when available, ``text``. Utterances that raise
    ``TooShortUttError`` are logged and written with a placeholder result.

    Raises:
        NotImplementedError: If ``batch_size > 1`` or ``ngpu > 1``.
    """
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    device = "cuda" if ngpu >= 1 else "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build speech2text
    speech2text = Speech2Text.from_pretrained(
        model_tag=model_tag,
        asr_train_config=asr_train_config,
        asr_model_file=asr_model_file,
        token_type=token_type,
        bpemodel=bpemodel,
        device=device,
        batch_size=batch_size,
        dtype=dtype,
        maskctc_n_iterations=maskctc_n_iterations,
        maskctc_threshold_probability=maskctc_threshold_probability,
    )

    # 3. Build data-iterator
    train_args = speech2text.asr_train_args
    loader = ASRTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=ASRTask.build_preprocess_fn(train_args, False),
        collate_fn=ASRTask.build_collate_fn(train_args, False),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Decode utterance by utterance
    with DatadirWriter(output_dir) as writer:
        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"

            # Drop the length entries and strip the batch axis
            batch = {
                name: value[0]
                for name, value in batch.items()
                if not name.endswith("_lengths")
            }

            try:
                results = speech2text(**batch)
            except TooShortUttError as e:
                logging.warning(f"Utterance {keys} {e}")
                hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
                results = [[" ", ["<space>"], [2], hyp]]

            # Only supporting batch_size==1
            key = keys[0]
            text, token, token_int, hyp = results[0]

            # Create a directory: outdir/1best_recog
            ibest_writer = writer["1best_recog"]

            # Write the result to each file
            ibest_writer["token"][key] = " ".join(token)
            ibest_writer["token_int"][key] = " ".join(map(str, token_int))
            ibest_writer["score"][key] = str(hyp.score)

            if text is not None:
                ibest_writer["text"][key] = text
def inference(
    output_dir: str,
    maxlenratio: float,
    minlenratio: float,
    batch_size: int,
    dtype: str,
    beam_size: int,
    ngpu: int,
    seed: int,
    ctc_weight: float,
    lm_weight: float,
    penalty: float,
    nbest: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    asr_train_config: str,
    asr_model_file: str,
    lm_train_config: Optional[str],
    lm_file: Optional[str],
    word_lm_train_config: Optional[str],
    word_lm_file: Optional[str],
    token_type: Optional[str],
    bpemodel: Optional[str],
    allow_variable_data_keys: bool,
    sim_chunk_length: int,
    disable_repetition_detection: bool,
    encoded_feat_length_limit: int,
    decoder_text_length_limit: int,
):
    """Decode utterances with ``Speech2TextStreaming`` and write n-best results.

    When ``sim_chunk_length`` is 0 each utterance is decoded in one call.
    Otherwise the ``speech`` entry is fed in consecutive slices of
    ``sim_chunk_length`` elements with ``is_final=False``, followed by the
    remainder with ``is_final=True``.

    Up to ``nbest`` results per utterance are written under
    ``output_dir/{n}best_recog`` as ``token``, ``token_int``, ``score`` and,
    when available, ``text``. ``word_lm_file`` is accepted but not used here.

    Raises:
        NotImplementedError: If ``batch_size > 1``, ``ngpu > 1`` or
            ``word_lm_train_config`` is given.
    """
    assert check_argument_types()
    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if word_lm_train_config is not None:
        raise NotImplementedError("Word LM is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    if ngpu >= 1:
        device = "cuda"
    else:
        device = "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build speech2text
    speech2text = Speech2TextStreaming(
        asr_train_config=asr_train_config,
        asr_model_file=asr_model_file,
        lm_train_config=lm_train_config,
        lm_file=lm_file,
        token_type=token_type,
        bpemodel=bpemodel,
        device=device,
        maxlenratio=maxlenratio,
        minlenratio=minlenratio,
        dtype=dtype,
        beam_size=beam_size,
        ctc_weight=ctc_weight,
        lm_weight=lm_weight,
        penalty=penalty,
        nbest=nbest,
        disable_repetition_detection=disable_repetition_detection,
        decoder_text_length_limit=decoder_text_length_limit,
        encoded_feat_length_limit=encoded_feat_length_limit,
    )

    # 3. Build data-iterator
    loader = ASRTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
        collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Start for-loop
    # FIXME(kamo): The output format should be discussed about
    with DatadirWriter(output_dir) as writer:
        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"
            # Drop the length entries and strip the batch axis
            batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
            assert len(batch.keys()) == 1

            try:
                if sim_chunk_length == 0:
                    # N-best list of (text, token, token_int, hyp_object)
                    results = speech2text(**batch)
                else:
                    speech = batch["speech"]
                    num_chunks = len(speech) // sim_chunk_length
                    for i in range(num_chunks):
                        speech2text(
                            speech=speech[
                                i * sim_chunk_length : (i + 1) * sim_chunk_length
                            ],
                            is_final=False,
                        )
                    # Start of the remainder, computed from the chunk count
                    # rather than the loop variable: for an utterance shorter
                    # than one chunk the loop never runs, and the loop variable
                    # would be unbound (or stale from a previous utterance).
                    tail_start = num_chunks * sim_chunk_length
                    results = speech2text(
                        speech[tail_start : len(speech)], is_final=True
                    )
            except TooShortUttError as e:
                logging.warning(f"Utterance {keys} {e}")
                hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
                results = [[" ", ["<space>"], [2], hyp]] * nbest

            # Only supporting batch_size==1
            key = keys[0]
            for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results):
                # Create a directory: outdir/{n}best_recog
                ibest_writer = writer[f"{n}best_recog"]

                # Write the result to each file
                ibest_writer["token"][key] = " ".join(token)
                ibest_writer["token_int"][key] = " ".join(map(str, token_int))
                ibest_writer["score"][key] = str(hyp.score)

                if text is not None:
                    ibest_writer["text"][key] = text
def calc_perplexity(
    output_dir: str,
    batch_size: int,
    dtype: str,
    ngpu: int,
    seed: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    train_config: Optional[str],
    model_file: Optional[str],
    log_base: Optional[float],
    allow_variable_data_keys: bool,
):
    """Compute per-utterance and overall perplexity of a language model.

    Per-utterance values are written to ``utt2ppl`` and ``utt2ntokens`` under
    ``output_dir``; the overall perplexity goes to ``output_dir/ppl`` and the
    logarithm base in use to ``output_dir/base`` (``e`` when ``log_base`` is
    None).
    """
    assert check_argument_types()
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    device = "cuda" if ngpu >= 1 else "cpu"

    def _perplexity(nll_sum, ntokens):
        # Same two formulas as before, selected by whether a base was given
        if log_base is None:
            return np.exp(nll_sum / ntokens)
        return log_base ** (nll_sum / ntokens / np.log(log_base))

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build LM
    model, train_args = LMTask.build_model_from_file(train_config, model_file, device)
    # Wrap the model so that model.nll() can run data-parallel
    wrapped_model = ForwardAdaptor(model, "nll")
    wrapped_model.to(dtype=getattr(torch, dtype)).eval()
    logging.info(f"Model:\n{model}")

    # 3. Build data-iterator
    loader = LMTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=LMTask.build_preprocess_fn(train_args, False),
        collate_fn=LMTask.build_collate_fn(train_args, False),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Start for-loop
    with DatadirWriter(output_dir) as writer:
        total_nll = 0.0
        total_ntokens = 0
        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"

            with torch.no_grad():
                batch = to_device(batch, device)
                if ngpu > 1:
                    nll, lengths = data_parallel(
                        wrapped_model, (), range(ngpu), module_kwargs=batch
                    )
                else:
                    # NOTE(kamo): data_parallel also should work with ngpu=1,
                    # but for debuggability it's better to keep this block.
                    nll, lengths = wrapped_model(**batch)

            assert _bs == len(nll) == len(lengths), (_bs, len(nll), len(lengths))
            # nll: (B, L) -> (B,)
            nll = nll.detach().cpu().numpy().sum(1)
            # lengths: (B,)
            lengths = lengths.detach().cpu().numpy()
            total_nll += nll.sum()
            total_ntokens += lengths.sum()

            for key, _nll, ntoken in zip(keys, nll, lengths):
                utt_ppl = _perplexity(_nll, ntoken)

                # Write PPL of each utts for debugging or analysis
                writer["utt2ppl"][key] = str(utt_ppl)
                writer["utt2ntokens"][key] = str(ntoken)

        ppl = _perplexity(total_nll, total_ntokens)

        out_root = Path(output_dir)
        with (out_root / "ppl").open("w", encoding="utf-8") as f:
            f.write(f"{ppl}\n")

        _log_base = np.e if log_base is None else log_base
        with (out_root / "base").open("w", encoding="utf-8") as f:
            f.write(f"{_log_base}\n")
        logging.info(f"PPL={ppl}")
def inference(
    output_dir: str,
    maxlenratio: float,
    minlenratio: float,
    batch_size: int,
    dtype: str,
    beam_size: int,
    ngpu: int,
    seed: int,
    ctc_weight: float,
    lm_weight: float,
    penalty: float,
    nbest: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    key_file: Optional[str],
    asr_train_config: str,
    asr_model_file: str,
    lm_train_config: Optional[str],
    lm_file: Optional[str],
    word_lm_train_config: Optional[str],
    word_lm_file: Optional[str],
    token_type: Optional[str],
    bpemodel: Optional[str],
    allow_variable_data_keys: bool,
    streaming: bool,
):
    """Decode batches with ``k2Speech2Text`` and write the 1-best results.

    The whole batch is passed to the decoder at once; one result per key is
    written under ``output_dir/1best_recog`` as ``token``, ``token_int``,
    ``score`` and, when available, ``text``. ``word_lm_train_config`` and
    ``word_lm_file`` are accepted but not used here.

    Raises:
        NotImplementedError: If ``ngpu > 1``.
    """
    assert check_argument_types()
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    device = "cuda" if ngpu >= 1 else "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build speech2text
    decoder_kwargs = dict(
        asr_train_config=asr_train_config,
        asr_model_file=asr_model_file,
        lm_train_config=lm_train_config,
        lm_file=lm_file,
        token_type=token_type,
        bpemodel=bpemodel,
        device=device,
        maxlenratio=maxlenratio,
        minlenratio=minlenratio,
        dtype=dtype,
        beam_size=beam_size,
        ctc_weight=ctc_weight,
        lm_weight=lm_weight,
        penalty=penalty,
        nbest=nbest,
        streaming=streaming,
    )
    speech2text = k2Speech2Text(**decoder_kwargs)

    # 3. Build data-iterator
    train_args = speech2text.asr_train_args
    loader = ASRTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=ASRTask.build_preprocess_fn(train_args, False),
        collate_fn=ASRTask.build_collate_fn(train_args, False),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Decode batch by batch
    with DatadirWriter(output_dir) as writer:
        for batch_idx, (keys, batch) in enumerate(loader):
            if batch_idx % 10 == 0:
                logging.info(f"Processing {batch_idx} batch")
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"

            # 1-best list of (text, token, token_int, score), one per key
            results = speech2text(batch)

            for key_idx, result in enumerate(results):
                text, token, token_int, score = result
                key = keys[key_idx]
                best_writer = writer["1best_recog"]

                # Write the result to each file
                best_writer["token"][key] = " ".join(token)
                best_writer["token_int"][key] = " ".join(map(str, token_int))
                best_writer["score"][key] = str(score)

                if text is not None:
                    best_writer["text"][key] = text
def scoring(
    output_dir: str,
    dtype: str,
    log_level: Union[int, str],
    key_file: str,
    ref_scp: List[str],
    inf_scp: List[str],
    ref_channel: int,
    flexible_numspk: bool,
):
    """Score separated audio, optionally with a varying number of speakers.

    For every key in ``key_file`` the best speaker permutation is obtained
    from ``bss_eval_sources`` and STOI, ESTOI (both in percent), SI-SNR, SDR,
    SAR and SIR are written per speaker under ``output_dir``.

    With ``flexible_numspk`` a key may be missing from some scp files; the
    side with fewer signals is padded with constant low-valued signals so
    both sides have the same number of rows, and no ``wav_spk{i}`` entries
    are written.

    Args:
        output_dir: Directory handed to ``DatadirWriter``.
        dtype: Data type passed to ``SoundScpReader``.
        log_level: Logging level.
        key_file: Text file whose first whitespace-separated field on each
            line is an utterance key.
        ref_scp: One scp file per reference speaker.
        inf_scp: One scp file per inferred speaker (same length as ``ref_scp``).
        ref_channel: Index applied to the last axis to select a channel from
            multi-channel signals.
        flexible_numspk: Whether the number of speakers may differ per key.
    """
    assert check_argument_types()

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    assert len(ref_scp) == len(inf_scp), ref_scp
    num_spk = len(ref_scp)

    # Read the keys inside a context manager so the file handle is released
    with open(key_file, encoding="utf-8") as f:
        keys = [line.rstrip().split(maxsplit=1)[0] for line in f]

    ref_readers = [SoundScpReader(f, dtype=dtype, normalize=True) for f in ref_scp]
    inf_readers = [SoundScpReader(f, dtype=dtype, normalize=True) for f in inf_scp]

    # get sample rate
    sample_rate, _ = ref_readers[0][keys[0]]

    # check keys
    if not flexible_numspk:
        for inf_reader, ref_reader in zip(inf_readers, ref_readers):
            assert inf_reader.keys() == ref_reader.keys()

    with DatadirWriter(output_dir) as writer:
        for key in keys:
            if not flexible_numspk:
                ref_audios = [ref_reader[key][1] for ref_reader in ref_readers]
                inf_audios = [inf_reader[key][1] for inf_reader in inf_readers]
            else:
                # Only the readers that actually contain this key contribute
                ref_audios = [
                    ref_reader[key][1]
                    for ref_reader in ref_readers
                    if key in ref_reader.keys()
                ]
                inf_audios = [
                    inf_reader[key][1]
                    for inf_reader in inf_readers
                    if key in inf_reader.keys()
                ]
            ref = np.array(ref_audios)
            inf = np.array(inf_audios)
            if ref.ndim > inf.ndim:
                # multi-channel reference and single-channel output
                ref = ref[..., ref_channel]
            elif ref.ndim < inf.ndim:
                # single-channel reference and multi-channel output
                inf = inf[..., ref_channel]
            elif ref.ndim == inf.ndim == 3:
                # multi-channel reference and output
                ref = ref[..., ref_channel]
                inf = inf[..., ref_channel]
            if not flexible_numspk:
                assert ref.shape == inf.shape, (ref.shape, inf.shape)
            else:
                # epsilon value to avoid divergence
                # caused by zero-value, e.g., log(0)
                eps = 0.000001
                # if num_spk of ref > num_spk of inf
                if ref.shape[0] > inf.shape[0]:
                    p = np.full((ref.shape[0] - inf.shape[0], inf.shape[1]), eps)
                    inf = np.concatenate([inf, p])
                    num_spk = ref.shape[0]
                # if num_spk of ref < num_spk of inf
                elif ref.shape[0] < inf.shape[0]:
                    p = np.full((inf.shape[0] - ref.shape[0], ref.shape[1]), eps)
                    ref = np.concatenate([ref, p])
                    num_spk = inf.shape[0]
                else:
                    num_spk = ref.shape[0]

            sdr, sir, sar, perm = bss_eval_sources(ref, inf, compute_permutation=True)

            for i in range(num_spk):
                # Index of the inferred stream assigned to reference speaker i
                assigned = int(perm[i])
                stoi_score = stoi(ref[i], inf[assigned], fs_sig=sample_rate)
                estoi_score = stoi(
                    ref[i], inf[assigned], fs_sig=sample_rate, extended=True
                )
                si_snr_score = -float(
                    si_snr_loss(
                        torch.from_numpy(ref[i][None, ...]),
                        torch.from_numpy(inf[assigned][None, ...]),
                    )
                )
                writer[f"STOI_spk{i + 1}"][key] = str(stoi_score * 100)  # in percentage
                writer[f"ESTOI_spk{i + 1}"][key] = str(estoi_score * 100)
                writer[f"SI_SNR_spk{i + 1}"][key] = str(si_snr_score)
                writer[f"SDR_spk{i + 1}"][key] = str(sdr[i])
                writer[f"SAR_spk{i + 1}"][key] = str(sar[i])
                writer[f"SIR_spk{i + 1}"][key] = str(sir[i])
                # save permutation assigned script file
                if not flexible_numspk:
                    writer[f"wav_spk{i + 1}"][key] = inf_readers[assigned].data[key]
def inference(
    output_dir: str,
    batch_size: int,
    dtype: str,
    beam_size: int,
    ngpu: int,
    seed: int,
    lm_weight: float,
    nbest: int,
    num_workers: int,
    log_level: Union[int, str],
    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
    asr_train_config: Optional[str],
    asr_model_file: Optional[str],
    beam_search_config: Optional[dict],
    lm_train_config: Optional[str],
    lm_file: Optional[str],
    model_tag: Optional[str],
    token_type: Optional[str],
    bpemodel: Optional[str],
    key_file: Optional[str],
    allow_variable_data_keys: bool,
    quantize_asr_model: Optional[bool],
    quantize_modules: Optional[List[str]],
    quantize_dtype: Optional[str],
    streaming: Optional[bool],
    chunk_size: Optional[int],
    left_context: Optional[int],
    right_context: Optional[int],
    display_partial_hypotheses: bool,
) -> None:
    """Run Transducer model inference and write the n-best results.

    Up to ``nbest`` results per utterance are written under
    ``output_dir/{n}best_recog`` as ``token``, ``token_int``, ``score`` and,
    when available, ``text``.

    Args:
        output_dir: Output directory path.
        batch_size: Batch decoding size (only 1 is supported).
        dtype: Data type.
        beam_size: Beam size.
        ngpu: Number of GPUs (at most 1 is supported).
        seed: Random number generator seed.
        lm_weight: Weight of language model.
        nbest: Number of final hypothesis.
        num_workers: Number of workers.
        log_level: Level of verbose for logs.
        data_path_and_name_and_type: Triples forwarded unchanged to the
            data-iterator builder.
        asr_train_config: ASR model training config path.
        asr_model_file: ASR model path.
        beam_search_config: Beam search config path.
        lm_train_config: Language Model training config path.
        lm_file: Language Model path.
        model_tag: Model tag.
        token_type: Type of token units.
        bpemodel: BPE model path.
        key_file: File key.
        allow_variable_data_keys: Whether to allow variable data keys.
        quantize_asr_model: Whether to apply dynamic quantization to ASR model.
        quantize_modules: List of module names to apply dynamic quantization on.
        quantize_dtype: Dynamic quantization data type.
        streaming: Whether to perform chunk-by-chunk inference.
        chunk_size: Number of frames in chunk AFTER subsampling.
        left_context: Number of frames in left context AFTER subsampling.
        right_context: Number of frames in right context AFTER subsampling.
        display_partial_hypotheses: Whether to display partial hypotheses.

    Raises:
        NotImplementedError: If ``batch_size > 1`` or ``ngpu > 1``.

    """
    assert check_argument_types()

    if batch_size > 1:
        raise NotImplementedError("batch decoding is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    device = "cuda" if ngpu >= 1 else "cpu"

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build speech2text
    speech2text = Speech2Text.from_pretrained(
        model_tag=model_tag,
        asr_train_config=asr_train_config,
        asr_model_file=asr_model_file,
        beam_search_config=beam_search_config,
        lm_train_config=lm_train_config,
        lm_file=lm_file,
        token_type=token_type,
        bpemodel=bpemodel,
        device=device,
        dtype=dtype,
        beam_size=beam_size,
        lm_weight=lm_weight,
        nbest=nbest,
        quantize_asr_model=quantize_asr_model,
        quantize_modules=quantize_modules,
        quantize_dtype=quantize_dtype,
        streaming=streaming,
        chunk_size=chunk_size,
        left_context=left_context,
        right_context=right_context,
    )

    # 3. Build data-iterator
    train_args = speech2text.asr_train_args
    loader = ASRTransducerTask.build_streaming_iterator(
        data_path_and_name_and_type,
        dtype=dtype,
        batch_size=batch_size,
        key_file=key_file,
        num_workers=num_workers,
        preprocess_fn=ASRTransducerTask.build_preprocess_fn(train_args, False),
        collate_fn=ASRTransducerTask.build_collate_fn(train_args, False),
        allow_variable_data_keys=allow_variable_data_keys,
        inference=True,
    )

    # 4. Start for-loop
    with DatadirWriter(output_dir) as writer:
        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys

            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"

            # Drop the length entries and strip the batch axis
            batch = {
                name: value[0]
                for name, value in batch.items()
                if not name.endswith("_lengths")
            }
            assert len(batch.keys()) == 1

            try:
                if not speech2text.streaming:
                    final_hyps = speech2text(**batch)
                else:
                    speech = batch["speech"]

                    # Feed full-size chunks first; the remainder (possibly the
                    # whole utterance) is sent as the final chunk.
                    _steps = len(speech) // speech2text._raw_ctx
                    _end = 0
                    for i in range(_steps):
                        _end = (i + 1) * speech2text._raw_ctx

                        speech2text.streaming_decode(
                            speech[i * speech2text._raw_ctx : _end], is_final=False
                        )

                    final_hyps = speech2text.streaming_decode(
                        speech[_end : len(speech)], is_final=True
                    )

                results = speech2text.hypotheses_to_results(final_hyps)
            except TooShortUttError as e:
                logging.warning(f"Utterance {keys} {e}")
                hyp = Hypothesis(score=0.0, yseq=[], dec_state=None)
                results = [[" ", ["<space>"], [2], hyp]] * nbest

            # Only the first key is used (batch_size is restricted to 1 above)
            key = keys[0]
            for n, result in zip(range(1, nbest + 1), results):
                text, token, token_int, hyp = result
                ibest_writer = writer[f"{n}best_recog"]

                ibest_writer["token"][key] = " ".join(token)
                ibest_writer["token_int"][key] = " ".join(map(str, token_int))
                ibest_writer["score"][key] = str(hyp.score)

                if text is not None:
                    ibest_writer["text"][key] = text