def __init__(self, idx_to_char, params=None):
    """Load the phone map, word symbols, transition model and decoding graph.

    Args:
        idx_to_char: Index-to-character mapping; stored on the instance and
            forwarded to ``create_phone_map``.
        params: Configuration mapping. Required keys: ``'phones_path'``,
            ``'words_path'``, ``'mdl_path'`` and ``'fst_path'``. Optional
            keys (with defaults): ``'acoustic'`` (1.2), ``'beam'`` (13) and
            ``'alphaweight'`` (0.3).

    Raises:
        KeyError: If ``params`` is a dict and a required key is missing.
    """
    # A literal ``{}`` default would be one dict shared by every call.
    if params is None:
        params = {}

    self.idx_to_char = idx_to_char
    self.reorder_1, self.reorder_2 = create_phone_map(
        params['phones_path'], idx_to_char)
    self.word_syms = SymbolTable.read_text(params['words_path'])

    self.acoustic_scale = params.get('acoustic', 1.2)
    if self.acoustic_scale < 0:
        print("Warning: acoustic scale is less than 0")
    self.alphaweight = params.get('alphaweight', 0.3)

    # Read the transition model from the model file.
    trans_model = TransitionModel()
    with xopen(params['mdl_path']) as ki:
        trans_model.read(ki.stream(), ki.binary)

    decoder_opts = FasterDecoderOptions()
    decoder_opts.beam = params.get('beam', 13)
    decode_fst = read_fst_kaldi(params['fst_path'])

    self.decoder_opts = decoder_opts
    self.trans_model = trans_model
    self.decode_fst = decode_fst

    # Language-model statistics bookkeeping, starting empty.
    self.stats = LMStats()
    self.stats_state = None
    self.add_stats_phase = True
def create_phone_map(filename, idx_to_char):
    """Build two parallel index arrays linking ``idx_to_char`` to a phone table.

    The phone symbol table is read from ``filename``. ``'EPS'`` is made an
    alias of ``'NON'``, and every key of ``DICT_MORPH`` whose target symbol
    exists in the table is added as an alias of that target.

    Args:
        filename: Path of the phone symbol table in text format.
        idx_to_char: Mapping indexed by ``1 .. len(idx_to_char)`` that yields
            the symbol for each index. Index 0 is treated as ``"EPS"``.

    Returns:
        A tuple ``(reorder_1, reorder_2)`` of numpy arrays of equal length.
        For every index whose symbol is found in the phone table,
        ``reorder_1`` holds the phone-table index minus one and ``reorder_2``
        holds the corresponding ``idx_to_char`` index.

    Raises:
        KeyError: If the phone table has no ``'NON'`` symbol.
    """
    # Symbols are decoded as UTF-8 here because the symbol-table reader does
    # not hand them back as text.
    phone_table = SymbolTable.read_text(filename)
    ph_to_idx = {
        phone_table.find_symbol(i).decode('utf8'): i
        for i in range(phone_table.num_symbols())
    }

    ph_to_idx['EPS'] = ph_to_idx['NON']
    for alias, target in DICT_MORPH.items():
        if target in ph_to_idx:
            ph_to_idx[alias] = ph_to_idx[target]

    reorder_1 = []
    reorder_2 = []
    for pyphnid in range(len(idx_to_char) + 1):
        symbol = "EPS" if pyphnid == 0 else idx_to_char[pyphnid]
        symbol = DICT_MORPH.get(symbol, symbol)
        phone_id = ph_to_idx.get(symbol)
        if phone_id is None:
            # Symbol unknown to the phone table: leave it out of both arrays.
            continue
        reorder_1.append(phone_id - 1)
        reorder_2.append(pyphnid)

    return np.array(reorder_1), np.array(reorder_2)
# Script excerpt: builds an NnetAligner, loads the phone symbol table and the
# word-boundary info, defines the feature rspecifiers and opens the readers.
# NOTE(review): NnetAligner and SymbolTable are not imported in this excerpt,
# and the final `with` statement continues past the end of it.
from kaldi.lat.align import WordBoundaryInfoNewOpts, WordBoundaryInfo
from kaldi.nnet3 import NnetSimpleComputationOptions
from kaldi.util.table import SequentialMatrixReader

# Construct aligner
decodable_opts = NnetSimpleComputationOptions()
decodable_opts.acoustic_scale = 1.0
decodable_opts.frame_subsampling_factor = 3
decodable_opts.frames_per_chunk = 150
aligner = NnetAligner.from_files("exp/tdnn_7b_chain_online/final.mdl",
                                 "exp/tdnn_7b_chain_online/tree",
                                 "exp/langdir/L.fst",
                                 "exp/langdir/words.txt",
                                 "exp/langdir/phones/disambig.int",
                                 decodable_opts=decodable_opts)
phones = SymbolTable.read_text("exp/langdir/phones.txt")
wb_info = WordBoundaryInfo.from_file(WordBoundaryInfoNewOpts(),
                                     "exp/langdir/phones/word_boundary.int")

# Define feature pipelines as Kaldi rspecifiers
feats_rspec = (
    "ark:compute-mfcc-feats --config=conf/mfcc.conf scp:data/wav.scp ark:- |")
# The two adjacent literals are concatenated into a single pipeline string:
# the MFCC output is piped into ivector-extract-online2.
ivectors_rspec = (
    "ark:compute-mfcc-feats --config=conf/mfcc.conf scp:data/wav.scp ark:- |"
    "ivector-extract-online2 --config=conf/ivector.conf ark:data/spk2utt ark:- ark:- |"
)

# Align wav files
# Features, i-vectors and the transcript file are opened together.
with SequentialMatrixReader(feats_rspec) as f, \
     SequentialMatrixReader(ivectors_rspec) as i, \
     open("data/text") as t, \
# Script excerpt: builds an NnetAligner from files under exp/ and data/lang,
# loads the phone symbol table and the word-boundary info, defines the feature
# rspecifiers and opens the readers.
# NOTE(review): NnetAligner and SymbolTable are not imported in this excerpt,
# and the final `with` statement continues past the end of it.
from kaldi.lat.align import WordBoundaryInfoNewOpts, WordBoundaryInfo
from kaldi.nnet3 import NnetSimpleComputationOptions
from kaldi.util.table import SequentialMatrixReader

# Construct aligner
decodable_opts = NnetSimpleComputationOptions()
decodable_opts.acoustic_scale = 1.0
decodable_opts.frame_subsampling_factor = 3
decodable_opts.frames_per_chunk = 150
aligner = NnetAligner.from_files("exp/tdnn_7b_chain_online/final.mdl",
                                 "exp/tdnn_7b_chain_online/tree",
                                 "data/lang/L.fst",
                                 "data/lang/words.txt",
                                 "data/lang/phones/disambig.int",
                                 decodable_opts=decodable_opts)
phones = SymbolTable.read_text("data/lang/phones.txt")
wb_info = WordBoundaryInfo.from_file(WordBoundaryInfoNewOpts(),
                                     "data/lang/phones/word_boundary.int")

# Define feature pipelines as Kaldi rspecifiers
feats_rspec = (
    "ark:compute-mfcc-feats --config=conf/mfcc_hires.conf scp:data/test/wav.scp ark:- |"
)
# The two adjacent literals are concatenated into a single pipeline string:
# the MFCC output is piped into ivector-extract-online2.
ivectors_rspec = (
    "ark:compute-mfcc-feats --config=conf/mfcc_hires.conf scp:data/test/wav.scp ark:- |"
    "ivector-extract-online2 --config=conf/ivector_extractor.conf ark:data/test/spk2utt ark:- ark:- |"
)

# Align wav files
# The feature and i-vector readers are opened together.
with SequentialMatrixReader(feats_rspec) as f, \
     SequentialMatrixReader(ivectors_rspec) as i, \
def gmm_decode_faster(model_rxfilename, fst_rxfilename,
                      feature_rspecifier, words_wspecifier,
                      alignment_wspecifier="", lattice_wspecifier="",
                      word_symbol_table="", acoustic_scale=0.1,
                      allow_partial=True,
                      decoder_opts=None):
    """Decode every utterance of a feature table with a GMM model.

    Args:
        model_rxfilename: Model file holding the transition model followed by
            the diagonal-GMM acoustic model.
        fst_rxfilename: Decoding graph file.
        feature_rspecifier: Table rspecifier of the input feature matrices.
        words_wspecifier: Table wspecifier for the decoded word-id sequences.
        alignment_wspecifier: Optional wspecifier for alignments; written only
            if the writer reports itself open.
        lattice_wspecifier: Optional wspecifier for the best path as a compact
            lattice; written only if the writer reports itself open.
        word_symbol_table: Optional word symbol table file. When given, the
            decoded words are also printed to stderr as text.
        acoustic_scale: Scale passed to the decodable object. When non-zero,
            it is undone on the best path before a lattice is written.
        allow_partial: If true, a traceback is produced even when the decoder
            did not reach a final state.
        decoder_opts: Decoder options; a fresh ``FasterDecoderOptions()`` is
            created when omitted.

    Returns:
        True if at least one utterance was decoded successfully, else False.

    Raises:
        RuntimeError: If ``word_symbol_table`` is given but cannot be read.
    """
    # Build the default per call: a default evaluated at definition time would
    # be one options object shared (and mutable) across all calls.
    if decoder_opts is None:
        decoder_opts = FasterDecoderOptions()

    # Read model.
    trans_model = TransitionModel()
    am_gmm = AmDiagGmm()
    with xopen(model_rxfilename) as ki:
        trans_model.read(ki.stream(), ki.binary)
        am_gmm.read(ki.stream(), ki.binary)

    # Open table readers/writers.
    feature_reader = SequentialMatrixReader(feature_rspecifier)
    words_writer = IntVectorWriter(words_wspecifier)
    alignment_writer = IntVectorWriter(alignment_wspecifier)
    clat_writer = CompactLatticeWriter(lattice_wspecifier)

    # Everything below runs under try/finally so the tables are closed even
    # when reading the symbol table or decoding raises.
    try:
        # Read symbol table.
        word_syms = None
        if word_symbol_table != "":
            word_syms = SymbolTable.read_text(word_symbol_table)
            if not word_syms:
                raise RuntimeError("Could not read symbol table from file {}"
                                   .format(word_symbol_table))

        # NOTE:
        # It is important to read decode_fst after opening feature reader as
        # it can prevent crashes on systems without enough virtual memory.

        # Read decoding graph and instantiate decoder.
        decode_fst = read_fst_kaldi(fst_rxfilename)
        decoder = FasterDecoder(decode_fst, decoder_opts)

        tot_like = 0.0
        frame_count = 0
        num_success, num_fail = 0, 0
        start = time.time()

        for key, features in feature_reader:
            if features.num_rows == 0:
                num_fail += 1
                logging.warning("Zero-length utterance: {}".format(key))
                continue

            gmm_decodable = DecodableAmDiagGmmScaled(am_gmm, trans_model,
                                                     features, acoustic_scale)
            decoder.decode(gmm_decodable)

            if not (allow_partial or decoder.reached_final()):
                num_fail += 1
                logging.warning("Did not successfully decode utterance {}, len = {}"
                                .format(key, features.num_rows))
                continue

            try:
                best_path = decoder.get_best_path()
            except RuntimeError:
                num_fail += 1
                logging.warning("Did not successfully decode utterance {}, len = {}"
                                .format(key, features.num_rows))
                continue

            if not decoder.reached_final():
                logging.warning("Decoder did not reach end-state, outputting "
                                "partial traceback since --allow-partial=true")

            ali, words, weight = get_linear_symbol_sequence(best_path)

            words_writer[key] = words

            if alignment_writer.is_open():
                alignment_writer[key] = ali

            if clat_writer.is_open():
                if acoustic_scale != 0.0:
                    # Undo the acoustic scaling before writing the lattice.
                    scale = acoustic_lattice_scale(1.0 / acoustic_scale)
                    scale_lattice(scale, best_path)
                best_path = convert_lattice_to_compact_lattice(best_path)
                clat_writer[key] = best_path

            if word_syms:
                syms = convert_indices_to_symbols(word_syms, words)
                print(key, " ".join(syms), file=sys.stderr)

            num_success += 1
            frame_count += features.num_rows
            like = -(weight.value1 + weight.value2)
            tot_like += like
            logging.info("Log-like per frame for utterance {} is {} over {} "
                         "frames.".format(key, like / features.num_rows,
                                          features.num_rows))
            logging.debug("Cost for utterance {} is {} + {}"
                          .format(key, weight.value1, weight.value2))

        elapsed = time.time() - start
        # With no decoded frames the per-frame figures are undefined; report
        # NaN instead of raising ZeroDivisionError.
        if frame_count:
            real_time_factor = elapsed * 100 / frame_count
            like_per_frame = tot_like / frame_count
        else:
            real_time_factor = like_per_frame = float("nan")

        logging.info("Time taken [excluding initialization] {}s: real-time factor "
                     "assuming 100 frames/sec is {}"
                     .format(elapsed, real_time_factor))
        logging.info("Done {} utterances, failed for {}"
                     .format(num_success, num_fail))
        logging.info("Overall log-likelihood per frame is {} over {} frames."
                     .format(like_per_frame, frame_count))
    finally:
        feature_reader.close()
        words_writer.close()
        if alignment_writer.is_open():
            alignment_writer.close()
        if clat_writer.is_open():
            clat_writer.close()

    return num_success != 0
def asr(filenameS_hash, filenameS, asr_beamsize=13, asr_max_active=8000):
    """Transcribe one media file and return word timings.

    The input is converted with ffmpeg to a 16 kHz mono WAV under ``tmp/``,
    decoded with the nnet3 model described by the YAML config under
    ``models/``, and the temporary files are removed afterwards.

    Args:
        filenameS_hash: Identifier used as the utterance/speaker key and as
            the stem of the temporary files ``tmp/<hash>.scp``,
            ``tmp/<hash>.wav`` and ``tmp/<hash>_spk2utt``.
            NOTE(review): this value is interpolated into shell pipelines
            (the rspecifiers below) — confirm callers only pass trusted,
            shell-safe values.
        filenameS: Path of the input media file handed to ffmpeg.
        asr_beamsize: Decoder beam.
        asr_max_active: Decoder ``max_active`` setting.

    Returns:
        A tuple ``(vtt, words)``: ``vtt`` is a list of
        ``[word, begin, end]`` lists built from the word alignment, and
        ``words`` is the list of decoded word symbols.

    Raises:
        AssertionError: If nothing was decoded, or if the feature and
            i-vector tables yield different keys.
    """
    models_dir = "models/"

    # Read yaml File
    config_file = "models/kaldi_tuda_de_nnet3_chain2.yaml"
    with open(config_file, 'r') as stream:
        model_yaml = yaml.safe_load(stream)
    decoder_yaml_opts = model_yaml['decoder']

    scp_filename = "tmp/%s.scp" % filenameS_hash
    wav_filename = "tmp/%s.wav" % filenameS_hash
    spk2utt_filename = "tmp/%s_spk2utt" % filenameS_hash

    try:
        # write scp file
        with open(scp_filename, 'w') as scp_file:
            scp_file.write("%s %s\n" % (filenameS_hash, wav_filename))

        # write spk2utt file
        with open(spk2utt_filename, 'w') as spk2utt_file:
            spk2utt_file.write("%s %s\n" % (filenameS_hash, filenameS_hash))

        # use ffmpeg to convert the input media file (any format!) to 16 kHz wav mono
        (
            ffmpeg
            .input(filenameS)
            .output(wav_filename, acodec='pcm_s16le', ac=1, ar='16k')
            .overwrite_output()
            .run()
        )

        # Construct recognizer
        decoder_opts = LatticeFasterDecoderOptions()
        decoder_opts.beam = asr_beamsize
        decoder_opts.max_active = asr_max_active
        decodable_opts = NnetSimpleComputationOptions()
        decodable_opts.acoustic_scale = 1.0
        decodable_opts.frame_subsampling_factor = 3
        decodable_opts.frames_per_chunk = 150
        # Named `recognizer` so it does not shadow this function's own name.
        recognizer = NnetLatticeFasterRecognizer.from_files(
            models_dir + decoder_yaml_opts["model"],
            models_dir + decoder_yaml_opts["fst"],
            models_dir + decoder_yaml_opts["word-syms"],
            decoder_opts=decoder_opts,
            decodable_opts=decodable_opts)

        # Construct symbol table
        symbols = SymbolTable.read_text(models_dir + decoder_yaml_opts["word-syms"])

        # Define feature pipelines as Kaldi rspecifiers
        feats_rspec = ("ark:compute-mfcc-feats --config=%s scp:" + scp_filename + " ark:- |") % \
                      (models_dir + decoder_yaml_opts["mfcc-config"])
        ivectors_rspec = (
            ("ark:compute-mfcc-feats --config=%s scp:" + scp_filename + " ark:-"
             + " | ivector-extract-online2 --config=%s ark:" + spk2utt_filename + " ark:- ark:- |") %
            ((models_dir + decoder_yaml_opts["mfcc-config"]),
             (models_dir + decoder_yaml_opts["ivector-extraction-config"]))
        )

        did_decode = False
        timing = None
        # Decode wav files
        with SequentialMatrixReader(feats_rspec) as f, \
                SequentialMatrixReader(ivectors_rspec) as i:
            for (fkey, feats), (ikey, ivectors) in zip(f, i):
                did_decode = True
                # Raised explicitly so the check survives `python -O`.
                if fkey != ikey:
                    raise AssertionError(
                        "feature key %r does not match ivector key %r" % (fkey, ikey))
                out = recognizer.decode((feats, ivectors))
                best_path = functions.compact_lattice_shortest_path(out["lattice"])
                timing = functions.compact_lattice_to_word_alignment(best_path)

        if not did_decode:
            raise AssertionError("no utterance was decoded")

        # Maps words to the numbers
        words = indices_to_symbols(symbols, timing[0])

        # Creates the datastructure (Word, begin(Frames), end(Frames))
        vtt = list(map(list, zip(words, timing[1], timing[2])))
    finally:
        # Cleanup tmp files, also when conversion or decoding failed. Files
        # that were never created are skipped so the original error is kept.
        for tmp_path in (scp_filename, wav_filename, spk2utt_filename):
            if os.path.exists(tmp_path):
                print('removing tmp file:', tmp_path)
                os.remove(tmp_path)

    return vtt, words
# Script excerpt: builds an NnetAligner from files in the current directory,
# loads the phone symbol table and word-boundary info, then walks the feature
# table, the i-vector table and the transcript file in lockstep.
import os

from kaldi.alignment import NnetAligner
from kaldi.fstext import SymbolTable
from kaldi.lat.align import WordBoundaryInfoNewOpts, WordBoundaryInfo
from kaldi.nnet3 import NnetSimpleComputationOptions
from kaldi.util.table import SequentialMatrixReader

# Construct aligner
decodable_opts = NnetSimpleComputationOptions()
decodable_opts.acoustic_scale = 1.0
decodable_opts.frame_subsampling_factor = 3
aligner = NnetAligner.from_files("final.mdl", "tree", "L.fst", "words.txt",
                                 "disambig.int", decodable_opts=decodable_opts)
phones = SymbolTable.read_text("phones.txt")
wb_info = WordBoundaryInfo.from_file(WordBoundaryInfoNewOpts(),
                                     "word_boundary.int")

# Define feature pipelines as Kaldi rspecifiers
feats_rspec = "ark:compute-mfcc-feats --config=mfcc.conf scp:wav.scp ark:- |"
# Same MFCC pipeline, additionally piped into ivector-extract-online2.
ivectors_rspec = (
    "ark:compute-mfcc-feats --config=mfcc.conf scp:wav.scp ark:-"
    " | ivector-extract-online2 --config=ivector.conf ark:spk2utt ark:- ark:- |"
)

# Align wav files
with SequentialMatrixReader(feats_rspec) as f, \
     SequentialMatrixReader(ivectors_rspec) as i, open("text") as t:
    # zip() stops at the shortest of the three inputs.
    for (fkey, feats), (ikey, ivectors), line in zip(f, i, t):
        # Each transcript line is split once on whitespace: key, then text.
        tkey, text = line.strip().split(None, 1)
# Construct recognizer decoder_opts = LatticeFasterDecoderOptions() decoder_opts.beam = 13 decoder_opts.max_active = 7000 decodable_opts = NnetSimpleComputationOptions() decodable_opts.acoustic_scale = 1.0 decodable_opts.frame_subsampling_factor = 3 decodable_opts.frames_per_chunk = 150 asr = NnetLatticeFasterRecognizer.from_files("final.mdl", "HCLG.fst", decoder_opts=decoder_opts, decodable_opts=decodable_opts) # Construct RNNLM rescorer symbols = SymbolTable.read_text("lm/words.txt") rnnlm_opts = RnnlmComputeStateComputationOptions() rnnlm_opts.bos_index = symbols.find_index("<s>") rnnlm_opts.eos_index = symbols.find_index("</s>") rnnlm_opts.brk_index = symbols.find_index("<brk>") compose_opts = ComposeLatticePrunedOptions() compose_opts.lattice_compose_beam = 4 rescorer = LatticeRnnlmPrunedRescorer.from_files( "lm/G.carpa", "rnnlm-get-word-embedding lm/word_feats.txt lm/feat_embedding.final.mat -|", "lm/final.raw", acoustic_scale=1.0, max_ngram_order=4, use_const_arpa=True, opts=rnnlm_opts, compose_opts=compose_opts)
acoustic_model = AmDiagGmm().read(ki.stream(), ki.binary) # Define the decodable wrapper: (features, acoustic_scale) -> decodable def make_decodable_wrapper(trans_model, acoustic_model): def decodable_wrapper(features, acoustic_scale): return DecodableAmDiagGmmScaled(acoustic_model, trans_model, features, acoustic_scale) return decodable_wrapper decodable_wrapper = make_decodable_wrapper(trans_model, acoustic_model) # Define the decoder decoding_graph = read_fst_kaldi("models/mono/graph/HCLG.fst") decoder_opts = LatticeFasterDecoderOptions() decoder_opts.beam = 13.0 decoder_opts.lattice_beam = 6.0 decoder = LatticeFasterDecoder(decoding_graph, decoder_opts) # Define the recognizer symbols = SymbolTable.read_text("models/mono/graph/words.txt") asr = Recognizer(decoder, decodable_wrapper, symbols) # Decode wave files # for key, wav in SequentialWaveReader("scp:wav.scp"): # feats = feat_pipeline(wav) # out = asr.decode(feats) # print(key, out["text"], flush=True)
# Script excerpt: sets up paths, a MappedAligner, a PyTorch acoustic model and
# a feature manager, then opens the log-likelihood writer.
# NOTE(review): data_path, transition_model_path, tree, lang_graph,
# symbols_path, disam, acoustic_model_path and conf_path are defined outside
# this excerpt, and the final `with` block's body is not visible here.
sample_list_path = 'epadb_full_path_list'
epadb_root_path = 'EpaDB'

mfccs_rspec = ("ark:" + data_path + "/mfccs.ark")
ivectors_rspec = ("ark:" + data_path + "/ivectors.ark")

loglikes_wspec = "ark:gop/loglikes.ark"

aligner = MappedAligner.from_files(transition_model_path, tree, lang_graph,
                                   symbols_path, disam, acoustic_scale=1.0)
# `phones` is rebound here: it is passed in as the table's file name and
# replaced by the symbol table read from that file.
phones = SymbolTable.read_text(phones)
wb_info = WordBoundaryInfo.from_file(
    WordBoundaryInfoNewOpts(),
    "data/lang_test_tgsmall/phones/word_boundary.int")

# Instantiate the PyTorch acoustic model (subclass of torch.nn.Module)
model = FTDNN()
model.load_state_dict(torch.load(acoustic_model_path))
model.eval()

# Create feature manager
feature_manager = FeatureManager(epadb_root_path, data_path, conf_path)

# NOTE(review): opened without a context manager; no close() is visible in
# this excerpt — confirm it is closed later.
align_out_file = open("gop/align_output", "w+")

# Decode and write output lattices
with DoubleMatrixWriter(loglikes_wspec) as loglikes_writer:
def __init__(
    self,
    cfg: KaldiDecoderConfig,
    beam: int,
    nbest: int = 1,
):
    """Configure a pykaldi decoder from ``cfg`` and load its graph and symbols.

    Args:
        cfg: Decoder configuration. If ``cfg.hlg_graph_path`` is None it is
            filled in (mutating ``cfg``) from ``cfg.kaldi_initializer_config``.
        beam: Decoding beam assigned to the decoder options.
        nbest: Number of hypotheses to keep; values above 1 require
            ``cfg.is_lattice``.

    Raises:
        ImportError: If pykaldi cannot be imported.
        AssertionError: If neither a graph path nor an initializer config is
            given, if the graph path does not exist, if ``nbest > 1`` without
            a lattice decoder, or if a line of ``cfg.output_dict`` does not
            have exactly two fields.
    """
    try:
        from kaldi.asr import FasterRecognizer, LatticeFasterRecognizer
        from kaldi.base import set_verbose_level
        from kaldi.decoder import (
            FasterDecoder,
            FasterDecoderOptions,
            LatticeFasterDecoder,
            LatticeFasterDecoderOptions,
        )
        from kaldi.lat.functions import DeterminizeLatticePhonePrunedOptions
        from kaldi.fstext import read_fst_kaldi, SymbolTable
    except ImportError:
        warnings.warn(
            "pykaldi is required for this functionality. Please install from https://github.com/pykaldi/pykaldi"
        )
        # Nothing below can work without pykaldi, so fail here with the real
        # cause instead of a later error about an unbound name.
        raise

    # set_verbose_level(2)

    self.acoustic_scale = cfg.acoustic_scale
    self.nbest = nbest

    if cfg.hlg_graph_path is None:
        assert (
            cfg.kaldi_initializer_config is not None
        ), "Must provide hlg graph path or kaldi initializer config"
        cfg.hlg_graph_path = initalize_kaldi(cfg.kaldi_initializer_config)

    assert os.path.exists(cfg.hlg_graph_path), cfg.hlg_graph_path

    # Pick the decoder, options and recognizer classes as one matching set.
    if cfg.is_lattice:
        self.dec_cls = LatticeFasterDecoder
        opt_cls = LatticeFasterDecoderOptions
        self.rec_cls = LatticeFasterRecognizer
    else:
        assert self.nbest == 1, "nbest > 1 requires lattice decoder"
        self.dec_cls = FasterDecoder
        opt_cls = FasterDecoderOptions
        self.rec_cls = FasterRecognizer

    self.decoder_options = opt_cls()
    self.decoder_options.beam = beam
    self.decoder_options.max_active = cfg.max_active
    self.decoder_options.beam_delta = cfg.beam_delta
    self.decoder_options.hash_ratio = cfg.hash_ratio

    # These settings are applied only for the lattice decoder.
    if cfg.is_lattice:
        self.decoder_options.lattice_beam = cfg.lattice_beam
        self.decoder_options.prune_interval = cfg.prune_interval
        self.decoder_options.determinize_lattice = cfg.determinize_lattice
        self.decoder_options.prune_scale = cfg.prune_scale
        det_opts = DeterminizeLatticePhonePrunedOptions()
        det_opts.max_mem = cfg.max_mem
        det_opts.phone_determinize = cfg.phone_determinize
        det_opts.word_determinize = cfg.word_determinize
        det_opts.minimize = cfg.minimize
        self.decoder_options.det_opts = det_opts

    # Map integer id -> symbol, parsed from "<symbol> <id>" lines.
    self.output_symbols = {}
    with open(cfg.output_dict, "r") as f:
        for line in f:
            items = line.rstrip().split()
            assert len(items) == 2
            self.output_symbols[int(items[1])] = items[0]

    logger.info(f"Loading FST from {cfg.hlg_graph_path}")
    self.fst = read_fst_kaldi(cfg.hlg_graph_path)
    # The same file is also loaded as a pykaldi symbol table.
    self.symbol_table = SymbolTable.read_text(cfg.output_dict)

    self.executor = ThreadPoolExecutor(max_workers=cfg.num_threads)
# Define the decodable wrapper: (features, acoustic_scale) -> decodable def make_decodable_wrapper(trans_model, acoustic_model): def decodable_wrapper(features, acoustic_scale): return DecodableAmDiagGmmScaled(acoustic_model, trans_model, features, acoustic_scale) return decodable_wrapper decodable_wrapper = make_decodable_wrapper(trans_model, acoustic_model) # Define the decoder decoding_graph = read_fst_kaldi( "/home/dogan/tools/pykaldi/egs/models/wsj/HCLG.fst") decoder_opts = FasterDecoderOptions() decoder_opts.beam = 13 decoder_opts.max_active = 7000 decoder = FasterDecoder(decoding_graph, decoder_opts) # Define the recognizer symbols = SymbolTable.read_text( "/home/dogan/tools/pykaldi/egs/models/wsj/words.txt") asr = Recognizer(decoder, decodable_wrapper, symbols) # Decode wave files for key, wav in SequentialWaveReader( "scp:/home/dogan/tools/pykaldi/egs/decoder/test2.scp"): feats = feat_pipeline(wav) out = asr.decode(feats) print(key, out["text"], flush=True)