def decode_one(self, data, as_idx=False):
    """Decode one utterance: reweight acoustic scores, then run the Kaldi LM decoder.

    Args:
        data: per-frame acoustic scores for one utterance (2-D array-like).
        as_idx: unused flag kept for interface compatibility.

    Returns:
        Tuple of (decoded text string, 0).
    """
    # Reweight the raw scores and permute columns into the LM's symbol order.
    scores = self.stats_state.reweight(data, self.alphaweight)
    scores = scores[:, self.reorder_2]

    # Scatter into a wider matrix indexed by reorder_1; positions with no
    # source column are filled with MIN_WEIGHT (effectively "impossible").
    expanded = np.full(
        (scores.shape[0], self.reorder_1.max() + 1),
        MIN_WEIGHT,
        dtype=np.float32)
    expanded[:, self.reorder_1] = scores

    # Run the Kaldi decoder over the prepared score matrix.
    log_likes = Matrix(expanded)
    decoder = FasterDecoder(self.decode_fst, self.decoder_opts)
    decodable = DecodableMatrixScaledMapped(
        self.trans_model, log_likes, self.acoustic_scale)
    decoder.decode(decodable)
    best_path = decoder.get_best_path()
    alignment, words, weight = get_linear_symbol_sequence(best_path)

    # Map the best-path word ids back to a text string.
    kaldi_unicode = kaldi2str_single(
        [self.word_syms.find_symbol(w).decode('utf8') for w in words])
    return kaldi_unicode, 0
def decode_one(self, logits, padding):
    """Decode a single utterance's logits with the configured Kaldi recognizer.

    Args:
        logits: per-frame emission scores (torch tensor); padded frames are
            dropped according to *padding*.
        padding: boolean mask of padded frames, or None for no padding.

    Returns:
        List of hypothesis dicts with "tokens", "words", "score", "emissions".
        Contains `self.nbest` entries when nbest > 1, otherwise exactly one.
    """
    from kaldi.matrix import Matrix

    decoder = self.dec_cls(self.fst, self.decoder_options)
    asr = self.rec_cls(
        decoder, self.symbol_table, acoustic_scale=self.acoustic_scale)

    # Strip padded frames before handing the matrix to Kaldi.
    if padding is not None:
        logits = logits[~padding]

    out = asr.decode(Matrix(logits.numpy()))

    if self.nbest > 1:
        from kaldi.fstext import shortestpath
        from kaldi.fstext.utils import (
            convert_compact_lattice_to_lattice,
            convert_lattice_to_std,
            convert_nbest_to_list,
            get_linear_symbol_sequence,
        )

        # Extract the n shortest paths from the lattice and split them
        # into individual linear FSTs.
        nbest_fst = shortestpath(out["lattice"], nshortest=self.nbest)
        nbest_fst = convert_compact_lattice_to_lattice(nbest_fst)
        nbest_fst = convert_lattice_to_std(nbest_fst)
        hypotheses = convert_nbest_to_list(nbest_fst)

        results = []
        for hyp in hypotheses:
            _, osyms, weight = get_linear_symbol_sequence(hyp)
            words = [self.output_symbols[sym] for sym in osyms]
            results.append({
                "tokens": words,
                "words": words,
                "score": weight.value,
                "emissions": logits,
            })
        return results

    # 1-best path: the recognizer already provides text and likelihood.
    words = out["text"].split()
    return [{
        "tokens": words,
        "words": words,
        "score": out["likelihood"],
        "emissions": logits,
    }]
def gmm_decode_faster(model_rxfilename, fst_rxfilename, feature_rspecifier,
                      words_wspecifier, alignment_wspecifier="",
                      lattice_wspecifier="", word_symbol_table="",
                      acoustic_scale=0.1, allow_partial=True,
                      decoder_opts=None):
    """Decode features with a GMM acoustic model and a FasterDecoder graph.

    Mirrors Kaldi's ``gmm-decode-faster`` binary: reads a transition model and
    AM-GMM, decodes every utterance in *feature_rspecifier*, and writes word
    sequences (plus optional alignments and best-path lattices).

    Args:
        model_rxfilename: rxfilename of the GMM acoustic model.
        fst_rxfilename: rxfilename of the decoding graph (HCLG).
        feature_rspecifier: rspecifier for input feature matrices.
        words_wspecifier: wspecifier for output word-id sequences.
        alignment_wspecifier: optional wspecifier for transition-id alignments.
        lattice_wspecifier: optional wspecifier for best-path compact lattices.
        word_symbol_table: optional word symbol-table file; when given, decoded
            transcripts are also printed to stderr.
        acoustic_scale: scaling factor applied to acoustic log-likelihoods.
        allow_partial: output a partial traceback when no final state is reached.
        decoder_opts: FasterDecoderOptions; a fresh default is created when None.
            (A default-argument instance would be shared across calls.)

    Returns:
        True if at least one utterance was decoded successfully.

    Raises:
        RuntimeError: if *word_symbol_table* is given but cannot be read.
    """
    # Avoid the mutable-default-argument trap: build options per call.
    if decoder_opts is None:
        decoder_opts = FasterDecoderOptions()

    # Read model.
    trans_model = TransitionModel()
    am_gmm = AmDiagGmm()
    with xopen(model_rxfilename) as ki:
        trans_model.read(ki.stream(), ki.binary)
        am_gmm.read(ki.stream(), ki.binary)

    # Open table readers/writers.
    feature_reader = SequentialMatrixReader(feature_rspecifier)
    words_writer = IntVectorWriter(words_wspecifier)
    alignment_writer = IntVectorWriter(alignment_wspecifier)
    clat_writer = CompactLatticeWriter(lattice_wspecifier)

    # Read symbol table.
    word_syms = None
    if word_symbol_table != "":
        word_syms = SymbolTable.read_text(word_symbol_table)
        if not word_syms:
            raise RuntimeError("Could not read symbol table from file {}"
                               .format(word_symbol_table))

    # NOTE:
    # It is important to read decode_fst after opening feature reader as
    # it can prevent crashes on systems without enough virtual memory.

    # Read decoding graph and instantiate decoder.
    decode_fst = read_fst_kaldi(fst_rxfilename)
    decoder = FasterDecoder(decode_fst, decoder_opts)

    tot_like = 0.0
    frame_count = 0
    num_success, num_fail = 0, 0
    start = time.time()

    for key, features in feature_reader:
        if features.num_rows == 0:
            num_fail += 1
            logging.warning("Zero-length utterance: {}".format(key))
            continue

        gmm_decodable = DecodableAmDiagGmmScaled(am_gmm, trans_model,
                                                 features, acoustic_scale)
        decoder.decode(gmm_decodable)

        if not (allow_partial or decoder.reached_final()):
            num_fail += 1
            logging.warning("Did not successfully decode utterance {}, len = {}"
                            .format(key, features.num_rows))
            continue

        try:
            best_path = decoder.get_best_path()
        except RuntimeError:
            num_fail += 1
            logging.warning("Did not successfully decode utterance {}, len = {}"
                            .format(key, features.num_rows))
            continue

        if not decoder.reached_final():
            logging.warning("Decoder did not reach end-state, outputting "
                            "partial traceback since --allow-partial=true")

        ali, words, weight = get_linear_symbol_sequence(best_path)

        words_writer[key] = words

        if alignment_writer.is_open():
            alignment_writer[key] = ali

        if clat_writer.is_open():
            # Undo the acoustic scaling before writing the lattice so the
            # stored acoustic costs are unscaled.
            if acoustic_scale != 0.0:
                scale = acoustic_lattice_scale(1.0 / acoustic_scale)
                scale_lattice(scale, best_path)
            best_path = convert_lattice_to_compact_lattice(best_path)
            clat_writer[key] = best_path

        if word_syms:
            syms = convert_indices_to_symbols(word_syms, words)
            print(key, " ".join(syms), file=sys.stderr)

        num_success += 1
        frame_count += features.num_rows
        like = -(weight.value1 + weight.value2)
        tot_like += like
        logging.info("Log-like per frame for utterance {} is {} over {} "
                     "frames.".format(key, like / features.num_rows,
                                      features.num_rows))
        logging.debug("Cost for utterance {} is {} + {}"
                      .format(key, weight.value1, weight.value2))

    elapsed = time.time() - start
    # Guard against division by zero when no frames were decoded at all
    # (empty rspecifier or every utterance failed).
    if frame_count > 0:
        logging.info("Time taken [excluding initialization] {}s: real-time "
                     "factor assuming 100 frames/sec is {}"
                     .format(elapsed, elapsed * 100 / frame_count))
        logging.info("Overall log-likelihood per frame is {} over {} frames."
                     .format(tot_like / frame_count, frame_count))
    logging.info("Done {} utterances, failed for {}"
                 .format(num_success, num_fail))

    feature_reader.close()
    words_writer.close()
    if alignment_writer.is_open():
        alignment_writer.close()
    if clat_writer.is_open():
        clat_writer.close()

    return num_success != 0
def asr(filenameS_hash, filenameS, asr_beamsize=13, asr_max_active=8000):
    """Transcribe a media file with a Kaldi nnet3 chain model.

    Converts the input file to 16 kHz mono WAV with ffmpeg, runs MFCC+ivector
    feature extraction pipelines, decodes with an NnetLatticeFasterRecognizer,
    and returns word-level timings.

    Args:
        filenameS_hash: hash used to name the temporary scp/wav/spk2utt files.
        filenameS: path of the input media file (any ffmpeg-readable format).
        asr_beamsize: decoder beam.
        asr_max_active: decoder max-active states.

    Returns:
        Tuple of (vtt, words) where vtt is a list of [word, begin_frame,
        end_frame] entries and words is the list of decoded word strings.
    """
    models_dir = "models/"

    # Read model configuration.
    config_file = "models/kaldi_tuda_de_nnet3_chain2.yaml"
    with open(config_file, 'r') as stream:
        model_yaml = yaml.safe_load(stream)
    decoder_yaml_opts = model_yaml['decoder']

    scp_filename = "tmp/%s.scp" % filenameS_hash
    wav_filename = "tmp/%s.wav" % filenameS_hash
    spk2utt_filename = "tmp/%s_spk2utt" % filenameS_hash

    # Write scp file mapping the utterance id to the converted wav.
    with open(scp_filename, 'w') as scp_file:
        scp_file.write("%s tmp/%s.wav\n" % (filenameS_hash, filenameS_hash))

    # Write a trivial one-speaker spk2utt file.
    with open(spk2utt_filename, 'w') as scp_file:
        scp_file.write("%s %s\n" % (filenameS_hash, filenameS_hash))

    # Use ffmpeg to convert the input media file (any format!) to 16 kHz
    # wav mono.  BUG FIX: the original referenced an undefined name
    # `filename`; the input path parameter is `filenameS`.
    (
        ffmpeg
        .input(filenameS)
        .output("tmp/%s.wav" % filenameS_hash,
                acodec='pcm_s16le', ac=1, ar='16k')
        .overwrite_output()
        .run()
    )

    # Construct recognizer.
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = asr_beamsize
    decoder_opts.max_active = asr_max_active
    decodable_opts = NnetSimpleComputationOptions()
    decodable_opts.acoustic_scale = 1.0
    decodable_opts.frame_subsampling_factor = 3
    decodable_opts.frames_per_chunk = 150
    asr = NnetLatticeFasterRecognizer.from_files(
        models_dir + decoder_yaml_opts["model"],
        models_dir + decoder_yaml_opts["fst"],
        models_dir + decoder_yaml_opts["word-syms"],
        decoder_opts=decoder_opts,
        decodable_opts=decodable_opts)

    # Construct symbol table for mapping word ids back to strings.
    symbols = SymbolTable.read_text(
        models_dir + decoder_yaml_opts["word-syms"])

    # Define feature pipelines as Kaldi rspecifiers.
    feats_rspec = ("ark:compute-mfcc-feats --config=%s scp:" + scp_filename
                   + " ark:- |") % \
        (models_dir + decoder_yaml_opts["mfcc-config"])
    ivectors_rspec = (
        ("ark:compute-mfcc-feats --config=%s scp:" + scp_filename + " ark:-"
         + " | ivector-extract-online2 --config=%s ark:" + spk2utt_filename
         + " ark:- ark:- |")
        % ((models_dir + decoder_yaml_opts["mfcc-config"]),
           (models_dir + decoder_yaml_opts["ivector-extraction-config"]))
    )

    did_decode = False
    # Decode wav files.
    with SequentialMatrixReader(feats_rspec) as f, \
            SequentialMatrixReader(ivectors_rspec) as i:
        for (fkey, feats), (ikey, ivectors) in zip(f, i):
            did_decode = True
            assert fkey == ikey
            out = asr.decode((feats, ivectors))
            best_path = functions.compact_lattice_shortest_path(out["lattice"])
            timing = functions.compact_lattice_to_word_alignment(best_path)

    assert did_decode

    # Map word ids to word strings.
    words = indices_to_symbols(symbols, timing[0])

    # Create the datastructure (Word, begin(Frames), end(Frames)).
    vtt = list(map(list, zip(words, timing[1], timing[2])))

    # Cleanup tmp files.
    print('removing tmp file:', scp_filename)
    os.remove(scp_filename)

    print('removing tmp file:', wav_filename)
    os.remove(wav_filename)

    print('removing tmp file:', spk2utt_filename)
    os.remove(spk2utt_filename)

    return vtt, words
# --- RNNLM lattice rescoring (script section) ---

# Sentence-boundary and break symbol indices for the RNNLM.
rnnlm_opts.eos_index = symbols.find_index("</s>")
rnnlm_opts.brk_index = symbols.find_index("<brk>")

compose_opts = ComposeLatticePrunedOptions()
compose_opts.lattice_compose_beam = 4

# Build the pruned RNNLM rescorer from the const-ARPA LM and RNNLM files.
rescorer = LatticeRnnlmPrunedRescorer.from_files(
    "lm/G.carpa",
    "rnnlm-get-word-embedding lm/word_feats.txt lm/feat_embedding.final.mat -|",
    "lm/final.raw",
    acoustic_scale=1.0,
    max_ngram_order=4,
    use_const_arpa=True,
    opts=rnnlm_opts,
    compose_opts=compose_opts)

# Define feature pipelines as Kaldi rspecifiers.
feats_rspec = "ark:compute-mfcc-feats --config=mfcc.conf scp:wav.scp ark:- |"
ivectors_rspec = (
    "ark:compute-mfcc-feats --config=mfcc.conf scp:wav.scp ark:-"
    " | ivector-extract-online2 --config=ivector.conf ark:spk2utt ark:- ark:- |"
)

# Decode each wav, rescore its lattice, and print the 1-best transcript.
with SequentialMatrixReader(feats_rspec) as feat_reader, \
        SequentialMatrixReader(ivectors_rspec) as ivec_reader:
    for (fkey, feats), (ikey, ivectors) in zip(feat_reader, ivec_reader):
        assert fkey == ikey
        out = asr.decode((feats, ivectors))
        rescored = rescorer.rescore(out["lattice"])
        words, _, _ = get_linear_symbol_sequence(shortestpath(rescored))
        print(fkey, " ".join(indices_to_symbols(symbols, words)), flush=True)