def __init__(self, *args, **kwargs):
    """Build the ASpIRE recognizer and the Kaldi feature rspecifiers.

    All positional and keyword arguments are forwarded unchanged to the
    parent class constructor.

    Attributes set:
        asr: recognizer built by ``NnetLatticeFasterRecognizer.from_files``.
        scp_fn: unique ``*.scp`` path derived from a temporary file name.
            The file itself is not created by this method.
        feats_rspec: rspecifier that pipes ``scp_fn`` through
            ``compute-mfcc-feats``.
        ivectors_rspec: the same pipeline followed by
            ``ivector-extract-online2``.
    """
    super(Tester, self).__init__(*args, **kwargs)

    print('Initializing aspire model...')

    # Root of the (machine-specific) ASpIRE example setup.
    aspire_dir = "/home/chris/git/pykaldi/examples/setups/aspire"

    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = 13
    decoder_opts.max_active = 7000

    decodable_opts = NnetSimpleComputationOptions()
    decodable_opts.acoustic_scale = 1.0
    decodable_opts.frame_subsampling_factor = 3
    decodable_opts.frames_per_chunk = 150

    self.asr = NnetLatticeFasterRecognizer.from_files(
        f"{aspire_dir}/exp/tdnn_7b_chain_online/final.mdl",
        f"{aspire_dir}/exp/tdnn_7b_chain_online/graph_pp/HCLG.fst",
        f"{aspire_dir}/data/lang/words.txt",
        decoder_opts=decoder_opts,
        decodable_opts=decodable_opts)

    # mkstemp() is used only to obtain a unique name. It hands back an
    # already-open OS-level descriptor, which must be closed explicitly or
    # it leaks for the lifetime of the process.
    fd, fn = tempfile.mkstemp()
    os.close(fd)
    os.remove(fn)
    self.scp_fn = scp_fn = '%s.scp' % fn

    # Define feature pipelines as Kaldi rspecifiers
    mfcc_pipeline = (
        f"ark:compute-mfcc-feats --config={aspire_dir}/conf/mfcc_hires.conf "
        f"scp:{scp_fn} ark:- |"
    )
    self.feats_rspec = mfcc_pipeline
    self.ivectors_rspec = (
        mfcc_pipeline
        + f"ivector-extract-online2 --config={aspire_dir}/conf/ivector_extractor.conf "
        + f"ark:{aspire_dir}/data/test/spk2utt ark:- ark:- |"
    )
def __init__(self, scp, model, graph, words, conf, iconf, spk2utt, output, printed=False, log=False):
    """
    Initialize the transcriber.

    Args:
        scp: path to the .SCP file with the audio
        model: path to the .MDL file of the recognition model
        graph: path to the .FST file of the overall recognition graph
        words: path to the .TXT file of the text corpus
        conf: path to the .CONF recognition configuration file
        iconf: path to the .CONF configuration file of the vector extractor
        spk2utt: path to the file listing the segments of each speaker
        output: path to the directory with the recognition results
        printed: flag enabling printing of the recognition results
        log: flag enabling logging
    """
    # Recognizer inputs.
    self.model, self.graph, self.words = model, graph, words
    # Feature-extraction inputs.
    self.scp, self.spk2utt = scp, spk2utt
    self.conf, self.iconf = conf, iconf
    # Output location and reporting switches.
    self.output = Path(output)
    self.printed, self.log = printed, log

    lattice_options = LatticeFasterDecoderOptions()
    lattice_options.beam = 13
    lattice_options.max_active = 7000

    nnet_options = NnetSimpleComputationOptions()
    nnet_options.acoustic_scale = 1.0
    nnet_options.frame_subsampling_factor = 3

    self.asr = NnetLatticeFasterRecognizer.from_files(
        model,
        graph,
        words,
        decoder_opts=lattice_options,
        decodable_opts=nnet_options,
    )
def load_model(config_file, online_config, models_path='models/', beam_size=10, frames_per_chunk=50):
    """Create an online nnet3 recognizer from a YAML model description.

    Args:
        config_file: YAML file whose ``decoder`` section names the model,
            graph, word symbol table and feature configuration files.
        online_config: Kaldi online configuration file. When it does not
            exist it is generated from the YAML settings before being read.
        models_path: prefix prepended (by plain string concatenation) to
            every file name taken from the YAML file.
        beam_size: decoding beam.
        frames_per_chunk: value assigned to ``decodable_opts.frames_per_chunk``.

    Returns:
        Tuple ``(asr, feat_info, decodable_opts)``.
    """
    with open(config_file, 'r') as yaml_stream:
        yaml_root = yaml.safe_load(yaml_stream)
    decoder_yaml_opts = yaml_root['decoder']
    print(decoder_yaml_opts)

    feat_opts = OnlineNnetFeaturePipelineConfig()
    endpoint_opts = OnlineEndpointConfig()

    if os.path.isfile(online_config):
        print("Loading online conf from:", online_config)
    else:
        print(online_config + ' does not exists. Trying to create it from yaml file settings.')
        print('See also online_config_options.info.txt for what possible settings are.')
        with open(online_config, 'w') as online_config_file:
            emit = online_config_file.write
            emit("--add_pitch=False\n")
            emit("".join(("--mfcc_config=", models_path,
                          decoder_yaml_opts['mfcc-config'], "\n")))
            emit("--feature_type=mfcc\n")
            emit("".join(("--ivector_extraction_config=", models_path,
                          decoder_yaml_opts['ivector-extraction-config'], '\n')))
            emit("".join(("--endpoint.silence-phones=",
                          decoder_yaml_opts['endpoint-silence-phones'], '\n')))

    # Both option structs register on one parser so a single config file
    # can populate feature and endpoint settings together.
    option_parser = ParseOptions("")
    for option_group in (feat_opts, endpoint_opts):
        option_group.register(option_parser)
    option_parser.read_config_file(online_config)
    feat_info = OnlineNnetFeaturePipelineInfo.from_config(feat_opts)

    # Construct recognizer
    decodable_opts = NnetSimpleLoopedComputationOptions()
    decodable_opts.acoustic_scale = 1.0
    decodable_opts.frame_subsampling_factor = 3
    decodable_opts.frames_per_chunk = frames_per_chunk

    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = beam_size
    decoder_opts.max_active = 7000

    model_file = models_path + decoder_yaml_opts["model"]
    graph_file = models_path + decoder_yaml_opts["fst"]
    symbols_file = models_path + decoder_yaml_opts["word-syms"]
    asr = NnetLatticeFasterOnlineRecognizer.from_files(
        model_file,
        graph_file,
        symbols_file,
        decoder_opts=decoder_opts,
        decodable_opts=decodable_opts,
        endpoint_opts=endpoint_opts)

    return asr, feat_info, decodable_opts
def __initialize_decoder(self):
    """Return a GMM lattice recognizer built from files under ``self.MODEL_DIR``.

    Reads ``final.mdl`` from the model directory and ``HCLG.fst`` and
    ``words.txt`` from its ``graph`` subdirectory.
    """
    model_path = os.path.join(self.MODEL_DIR, "final.mdl")
    graph_path = os.path.join(self.MODEL_DIR, "graph", "HCLG.fst")
    words_path = os.path.join(self.MODEL_DIR, "graph", "words.txt")

    # Decoding options (same as archive/config/decode.conf).
    # max_active is deliberately not overridden here.
    lattice_opts = LatticeFasterDecoderOptions()
    lattice_opts.beam = 13.0
    lattice_opts.lattice_beam = 6.0

    return GmmLatticeFasterRecognizer.from_files(
        model_path, graph_path, words_path, decoder_opts=lattice_opts)
def LoadModels(self):
    """Load the acoustic model, decoding graph and online recognizer.

    Reads ``online.conf`` from ``self.CONFIG_FILES_PATH``, ``final.mdl``
    from ``self.AM_PATH`` and ``HCLG.fst``, ``words.txt`` and
    ``word_boundary.int`` from ``self.LM_PATH``.

    Attributes set: ``endpoint_opts``, ``decodable_opts``, ``feat_info``,
    ``samp_freq``, ``frame_shift``, ``acwt``, ``transition_model``,
    ``acoustic_model``, ``decoder_graph``, ``symbols``, ``info``, ``asr``.

    Raises:
        ValueError: if any step fails. The original exception is logged
            and attached as the cause so the traceback keeps the root error.
    """
    try:
        # Define online feature pipeline. Every option struct registers on
        # the same parser so that one config file can populate all of them.
        po = ParseOptions("")
        decoder_opts = LatticeFasterDecoderOptions()
        self.endpoint_opts = OnlineEndpointConfig()
        self.decodable_opts = NnetSimpleLoopedComputationOptions()
        feat_opts = OnlineNnetFeaturePipelineConfig()
        decoder_opts.register(po)
        self.endpoint_opts.register(po)
        self.decodable_opts.register(po)
        feat_opts.register(po)
        po.read_config_file(self.CONFIG_FILES_PATH + "/online.conf")
        self.feat_info = OnlineNnetFeaturePipelineInfo.from_config(
            feat_opts)

        # Set metadata parameters
        self.samp_freq = self.feat_info.mfcc_opts.frame_opts.samp_freq
        # frame_shift_ms is divided by 1000 to express the shift in seconds.
        self.frame_shift = self.feat_info.mfcc_opts.frame_opts.frame_shift_ms / 1000
        self.acwt = self.decodable_opts.acoustic_scale

        # Load Acoustic and graph models and other files
        self.transition_model, self.acoustic_model = NnetRecognizer.read_model(
            self.AM_PATH + "/final.mdl")
        graph = _fst.read_fst_kaldi(self.LM_PATH + "/HCLG.fst")
        self.decoder_graph = LatticeFasterOnlineDecoder(
            graph, decoder_opts)
        self.symbols = _fst.SymbolTable.read_text(self.LM_PATH +
                                                  "/words.txt")
        self.info = WordBoundaryInfo.from_file(
            WordBoundaryInfoNewOpts(), self.LM_PATH + "/word_boundary.int")
        self.asr = NnetLatticeFasterOnlineRecognizer(
            self.transition_model,
            self.acoustic_model,
            self.decoder_graph,
            self.symbols,
            decodable_opts=self.decodable_opts,
            endpoint_opts=self.endpoint_opts)
        del graph, decoder_opts
    except Exception as e:
        self.log.error(e)
        # Chain the cause explicitly so callers can still reach the
        # underlying error through ``__cause__``.
        raise ValueError(
            "AM and LM loading failed!!! (see logs for more details)") from e
#!/usr/bin/env python

from __future__ import print_function

from kaldi.asr import NnetLatticeFasterRecognizer
from kaldi.decoder import LatticeFasterDecoderOptions
from kaldi.nnet3 import NnetSimpleComputationOptions
from kaldi.util.table import SequentialMatrixReader

# Options for the neural-network (decodable) side of the recognizer.
decodable_opts = NnetSimpleComputationOptions()
decodable_opts.acoustic_scale = 1.0
decodable_opts.frame_subsampling_factor = 3
decodable_opts.frames_per_chunk = 150

# Options for the lattice-generating decoder.
decoder_opts = LatticeFasterDecoderOptions()
decoder_opts.beam = 13
decoder_opts.max_active = 7000

# Model, decoding graph and word symbol table, in the order from_files expects.
_RECOGNIZER_FILES = (
    "exp/tdnn_7b_chain_online/final.mdl",
    "exp/tdnn_7b_chain_online/graph_pp/HCLG.fst",
    "exp/tdnn_7b_chain_online/graph_pp/words.txt",
)
asr = NnetLatticeFasterRecognizer.from_files(
    *_RECOGNIZER_FILES,
    decoder_opts=decoder_opts,
    decodable_opts=decodable_opts)

# Feature pipelines expressed as Kaldi rspecifiers. The i-vector pipeline is
# the MFCC pipeline with one more command appended to it.
feats_rspec = "ark:compute-mfcc-feats --config=conf/mfcc.conf scp:data/wav.scp ark:- |"
ivectors_rspec = feats_rspec + (
    "ivector-extract-online2 --config=conf/ivector.conf ark:data/spk2utt ark:- ark:- |"
)
def main(self):
    """Record phrases from the microphone and run the recognizer on each.

    A phrase starts when any value in the sliding energy window exceeds
    ``THRESHOLD`` and ends once the whole window is below it again. Each
    finished phrase is written to disk with ``self.save_speech``, recognized
    with ``self.recognize_speech`` and the temporary file is removed.

    The loop runs forever when ``num_phrases`` is ``-1`` (each result is
    printed); otherwise it stops after ``num_phrases`` phrases.

    Returns:
        The list of recognition results collected when ``num_phrases`` is
        not ``-1`` (empty in the endless mode).
    """
    # Construct recognizer
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = 13
    decoder_opts.max_active = 7000
    decodable_opts = NnetSimpleComputationOptions()
    decodable_opts.acoustic_scale = 1.0
    decodable_opts.frame_subsampling_factor = 3
    decodable_opts.frames_per_chunk = 150
    asr = NnetLatticeFasterRecognizer.from_files(
        self.dir_path + "/exp/tdnn_7b_chain_online/final.mdl",
        self.dir_path + "/new/graph/HCLG.fst",
        self.dir_path + "/new/graph/words.txt",
        decoder_opts=decoder_opts,
        decodable_opts=decodable_opts)

    response = []
    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
        try:
            print("* recording")
            audio2send = []
            rel = RATE / CHUNK  # audio chunks per second
            slid_win = deque(maxlen=int(SILENCE_LIMIT * rel) + 1)
            # Audio kept from just before the noise was detected; it is
            # prepended to the phrase so its onset is not cut off.
            prev_audio = deque(maxlen=int(PREV_AUDIO * rel) + 1)
            started = False
            n = num_phrases

            while num_phrases == -1 or n > 0:
                cur_data = stream.read(CHUNK)
                slid_win.append(math.sqrt(abs(audioop.avg(cur_data, 4))))
                if any(x > THRESHOLD for x in slid_win):
                    started = True
                    audio2send.append(cur_data)
                elif started:
                    # The silence limit was reached: finish the capture and
                    # hand the phrase to the recognizer.
                    filename = self.save_speech(
                        list(prev_audio) + audio2send, p, self.SAVE_PATH,
                        self.WAVE_OUTPUT_FILENAME)
                    # NOTE(review): recognize_speech() is not given the file
                    # name; presumably it reads the file written above.
                    r = self.recognize_speech(asr)
                    if num_phrases == -1:
                        print("Detected speech: ", r)
                    else:
                        response.append(r)
                    # Remove temp file. Comment line to review.
                    os.remove(filename)
                    # Reset all
                    started = False
                    slid_win = deque(maxlen=int(SILENCE_LIMIT * rel) + 1)
                    # Same size as the initial pre-roll buffer (the reset
                    # previously hard-coded 0.5 instead of PREV_AUDIO).
                    prev_audio = deque(maxlen=int(PREV_AUDIO * rel) + 1)
                    audio2send = []
                    n -= 1
                else:
                    prev_audio.append(cur_data)
        finally:
            # Release the input device even if the loop is interrupted.
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    return response
def main():
    """Run sequence-discriminative training of the LSTM acoustic model.

    Parses the command line, loads the YAML experiment and data
    configurations, initializes Horovod, builds the data loader, model and
    distributed optimizer, restores the seed model, sets up the lattice
    decoder and then trains for ``-num_epochs`` epochs. Rank 0 writes a
    checkpoint to ``-exp_dir`` after every epoch.

    A missing required file is reported on stderr and the process exits
    with status 1.
    """

    def _is_file(path):
        # Command-line options default to None when omitted, and
        # os.path.isfile(None) raises TypeError.
        return path is not None and os.path.isfile(path)

    def _exit_missing(label, path):
        # Report a missing input and stop with a failing exit status.
        sys.stderr.write('ERROR: The %s %s does not exist!\n' % (label, path))
        sys.exit(1)

    parser = argparse.ArgumentParser()
    parser.add_argument("-config")
    parser.add_argument("-data", help="data yaml file")
    parser.add_argument("-data_path",
                        default='',
                        type=str,
                        help="path of data files")
    parser.add_argument("-seed_model", help="the seed nerual network model")
    parser.add_argument("-exp_dir", help="the directory to save the outputs")
    parser.add_argument("-transform",
                        help="feature transformation matrix or mvn statistics")
    parser.add_argument("-criterion",
                        type=str,
                        choices=["mmi", "mpfe", "smbr"],
                        help="set the sequence training crtierion")
    parser.add_argument(
        "-trans_model",
        help="the HMM transistion model, used for lattice generation")
    parser.add_argument(
        "-prior_path",
        help="the prior for decoder, usually named as final.occs in kaldi setup")
    parser.add_argument(
        "-den_dir",
        help="the decoding graph directory to find HCLG and words.txt files")
    parser.add_argument("-lr", type=float, help="set the learning rate")
    parser.add_argument("-ce_ratio",
                        default=0.1,
                        type=float,
                        help="the ratio for ce regularization")
    parser.add_argument("-momentum",
                        default=0,
                        type=float,
                        help="set the momentum")
    parser.add_argument("-batch_size",
                        default=32,
                        type=int,
                        help="Override the batch size in the config")
    parser.add_argument("-data_loader_threads",
                        default=0,
                        type=int,
                        help="number of workers for data loading")
    parser.add_argument("-max_grad_norm",
                        default=5,
                        type=float,
                        help="max_grad_norm for gradient clipping")
    parser.add_argument("-sweep_size",
                        default=100,
                        type=float,
                        help="process n hours of data per sweep (default:100)")
    parser.add_argument("-num_epochs",
                        default=1,
                        type=int,
                        help="number of training epochs (default:1)")
    parser.add_argument('-print_freq',
                        default=10,
                        type=int,
                        metavar='N',
                        help='print frequency (default: 10)')
    parser.add_argument('-save_freq',
                        default=1000,
                        type=int,
                        metavar='N',
                        help='save model frequency (default: 1000)')

    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)

    config['data_path'] = args.data_path
    config["sweep_size"] = args.sweep_size

    print("pytorch version:{}".format(th.__version__))

    with open(args.data) as f:
        data = yaml.safe_load(f)
        config["source_paths"] = list(data['clean_source'].values())

    print("Experiment starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    # Initialize Horovod
    hvd.init()
    th.cuda.set_device(hvd.local_rank())
    print("Run experiments with world size {}".format(hvd.size()))

    dataset = SpeechDataset(config)
    transform = None
    if args.transform is not None and os.path.isfile(args.transform):
        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only pass transform files from a trusted source.
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)
        dataset.transform = transform

    train_dataloader = SeqDataloader(dataset,
                                     batch_size=args.batch_size,
                                     num_workers=args.data_loader_threads,
                                     distributed=True,
                                     test_only=False)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(train_dataloader)))

    # Every Horovod rank reaches this line; exist_ok avoids the race between
    # a separate isdir() check and makedirs().
    os.makedirs(args.exp_dir, exist_ok=True)

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2,
                   model_config["label_size"])

    model.cuda()

    # setup the optimizer
    optimizer = th.optim.SGD(model.parameters(),
                             lr=args.lr,
                             momentum=args.momentum)

    # Broadcast parameters and optimizer state from rank 0 to all other processes.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())

    if not _is_file(args.seed_model):
        _exit_missing("model file", args.seed_model)
    checkpoint = th.load(args.seed_model)
    state_dict = checkpoint['model']
    from collections import OrderedDict
    new_state_dict = OrderedDict()
    prefix = 'module.'
    for k, v in state_dict.items():
        # Drop the 'module.' prefix added by DataParallel, but only where it
        # is present so checkpoints saved without it load unchanged.
        name = k[len(prefix):] if k.startswith(prefix) else k
        new_state_dict[name] = v
    model.load_state_dict(new_state_dict)
    print("=> loaded checkpoint '{}' ".format(args.seed_model))

    HCLG = args.den_dir + "/HCLG.fst"
    words_txt = args.den_dir + "/words.txt"
    silence_phones = args.den_dir + "/phones/silence.csl"

    if not os.path.isfile(HCLG):
        _exit_missing("HCLG file", HCLG)

    if not os.path.isfile(words_txt):
        _exit_missing("words.txt file", words_txt)

    if not os.path.isfile(silence_phones):
        _exit_missing("silence phone file", silence_phones)

    # The first line of silence.csl is a colon-separated list of phone ids.
    with open(silence_phones) as f:
        silence_ids = [int(i) for i in f.readline().strip().split(':')]

    if not _is_file(args.trans_model):
        _exit_missing("trans_model", args.trans_model)
    trans_model = kaldi_hmm.TransitionModel()
    with kaldi_util.io.xopen(args.trans_model) as ki:
        trans_model.read(ki.stream(), ki.binary)

    # now we can setup the decoder
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = config["decoder_config"]["beam"]
    decoder_opts.lattice_beam = config["decoder_config"]["lattice_beam"]
    decoder_opts.max_active = config["decoder_config"]["max_active"]
    acoustic_scale = config["decoder_config"]["acoustic_scale"]
    # To produce raw state-level lattice instead of compact lattice
    decoder_opts.determinize_lattice = False
    asr_decoder = MappedLatticeFasterRecognizer.from_files(
        args.trans_model,
        HCLG,
        words_txt,
        acoustic_scale=acoustic_scale,
        decoder_opts=decoder_opts)

    # Normalize the first row of the prior matrix and take its log.
    prior = kaldi_util.io.read_matrix(args.prior_path).numpy()
    log_prior = th.tensor(np.log(prior[0] / np.sum(prior[0])), dtype=th.float)

    model.train()
    for epoch in range(args.num_epochs):

        run_train_epoch(model, optimizer, log_prior.cuda(), train_dataloader,
                        epoch, asr_decoder, trans_model, silence_ids, args)

        # save model (rank 0 only, so the ranks do not overwrite each other)
        if hvd.rank() == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            checkpoint['epoch'] = epoch
            output_file = args.exp_dir + '/model.se.' + str(epoch) + '.tar'
            th.save(checkpoint, output_file)
def init(self, nnet_directory, transcription_directory):
    """Prepare the feature rspecifiers and the nnet3 recognizer.

    Args:
        nnet_directory: directory holding ``conf/``, ``final.mdl`` and
            ``graph/``. ``None`` selects
            ``KaldiNnetDecoder.CV_default_nnet_directory``.
        transcription_directory: directory holding ``wav.scp`` and
            ``spk2utt``. ``None`` selects
            ``KaldiNnetDecoder.CV_default_transcription_directory``.

    Returns:
        A dict keyed by ``RDK.success``, ``RDK.return_msg`` and
        ``RDK.debug_data``. ``RDK.success`` is ``RC.input_validation`` when
        an argument is neither ``None`` nor a string, otherwise
        ``RC.success``.

    On success, sets ``IV_feats``, ``IV_ivectors``, ``IV_asr`` and
    ``IV_is_ready``.
    """
    return_msg = "KaldiDecoder:init"
    debug_data = []

    ## input validation
    if nnet_directory is None:
        nnet_directory = KaldiNnetDecoder.CV_default_nnet_directory
    elif not isinstance(nnet_directory, str):
        return_msg += "nnet_directory is not of type string, is type {}".format(
            type(nnet_directory))
        return {
            RDK.success: RC.input_validation,
            RDK.return_msg: return_msg,
            RDK.debug_data: debug_data
        }

    if transcription_directory is None:
        transcription_directory = KaldiNnetDecoder.CV_default_transcription_directory
    elif not isinstance(transcription_directory, str):
        return_msg += "transcription_directory is not of type string, is type {}".format(
            type(transcription_directory))
        return {
            RDK.success: RC.input_validation,
            RDK.return_msg: return_msg,
            RDK.debug_data: debug_data
        }
    ##</end> input validation

    ## feats and ivector rspec creation
    # {0} is the nnet directory, {1} is the transcription directory.
    feats = (
        "ark:compute-mfcc-feats --config={0}/conf/mfcc.conf scp:{1}/wav.scp ark:- |"
    ).format(nnet_directory, transcription_directory)
    ivectors = (
        "ark:compute-mfcc-feats --config={0}/conf/mfcc.conf scp:{1}/wav.scp ark:- |"
        "ivector-extract-online2 --config={0}/conf/ivector_extractor.conf ark:{1}/spk2utt ark:- ark:- |"
    ).format(nnet_directory, transcription_directory)
    ##</end> feats and ivector rspec creation

    ## asr creation
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = 13
    decoder_opts.max_active = 7000
    decodable_opts = NnetSimpleComputationOptions()
    decodable_opts.acoustic_scale = 1.0
    decodable_opts.frame_subsampling_factor = 3
    decodable_opts.frames_per_chunk = 150
    asr = NnetLatticeFasterRecognizer.from_files(
        "{}/final.mdl".format(nnet_directory),
        "{}/graph/HCLG.fst".format(nnet_directory),
        "{}/graph/words.txt".format(nnet_directory),
        decoder_opts=decoder_opts,
        decodable_opts=decodable_opts)
    ##</end> asr creation

    self.IV_feats = feats
    self.IV_ivectors = ivectors
    self.IV_asr = asr
    self.IV_is_ready = True

    return {
        RDK.success: RC.success,
        RDK.return_msg: return_msg,
        RDK.debug_data: debug_data
    }
def asr(filenameS_hash, filenameS, asr_beamsize=13, asr_max_active=8000):
    """Transcribe one media file and return word-level timings.

    The input is converted with ffmpeg to a 16 kHz mono PCM wav under
    ``tmp/`` and decoded with the nnet3 model described in
    ``models/kaldi_tuda_de_nnet3_chain2.yaml``. The temporary scp, wav and
    spk2utt files are removed afterwards, including when a step fails.

    Args:
        filenameS_hash: identifier used as the utterance/speaker id and as
            the base name of the temporary files.
        filenameS: path of the input media file.
        asr_beamsize: decoding beam.
        asr_max_active: value assigned to ``decoder_opts.max_active``.

    Returns:
        Tuple ``(vtt, words)``: ``vtt`` is a list of
        ``[word, begin, end]`` entries built from the word alignment and
        ``words`` is the list of recognized word symbols.

    Raises:
        AssertionError: if no utterance was decoded, or if the feature and
            i-vector readers yield different keys.
    """
    models_dir = "models/"

    # Read yaml File
    config_file = "models/kaldi_tuda_de_nnet3_chain2.yaml"
    with open(config_file, 'r') as stream:
        model_yaml = yaml.safe_load(stream)
    decoder_yaml_opts = model_yaml['decoder']

    scp_filename = "tmp/%s.scp" % filenameS_hash
    wav_filename = "tmp/%s.wav" % filenameS_hash
    spk2utt_filename = "tmp/%s_spk2utt" % filenameS_hash

    try:
        # write scp file: <utterance id> <wav path>
        with open(scp_filename, 'w') as scp_file:
            scp_file.write("%s %s\n" % (filenameS_hash, wav_filename))

        # write spk2utt file: <speaker id> <utterance id>
        with open(spk2utt_filename, 'w') as spk2utt_file:
            spk2utt_file.write("%s %s\n" % (filenameS_hash, filenameS_hash))

        # use ffmpeg to convert the input media file (any format!) to 16 kHz wav mono
        (
            ffmpeg
            .input(filenameS)
            .output(wav_filename, acodec='pcm_s16le', ac=1, ar='16k')
            .overwrite_output()
            .run()
        )

        # Construct recognizer
        decoder_opts = LatticeFasterDecoderOptions()
        decoder_opts.beam = asr_beamsize
        decoder_opts.max_active = asr_max_active
        decodable_opts = NnetSimpleComputationOptions()
        decodable_opts.acoustic_scale = 1.0
        decodable_opts.frame_subsampling_factor = 3
        decodable_opts.frames_per_chunk = 150
        recognizer = NnetLatticeFasterRecognizer.from_files(
            models_dir + decoder_yaml_opts["model"],
            models_dir + decoder_yaml_opts["fst"],
            models_dir + decoder_yaml_opts["word-syms"],
            decoder_opts=decoder_opts, decodable_opts=decodable_opts)

        # Construct symbol table
        symbols = SymbolTable.read_text(models_dir + decoder_yaml_opts["word-syms"])
        # NOTE(review): phi_label is not used anywhere in this function.
        phi_label = symbols.find_index("#0")

        # Define feature pipelines as Kaldi rspecifiers. The paths are passed
        # as format arguments rather than spliced into the template, so a
        # '%' in a file name cannot corrupt the format string.
        mfcc_conf = models_dir + decoder_yaml_opts["mfcc-config"]
        ivector_conf = models_dir + decoder_yaml_opts["ivector-extraction-config"]
        feats_rspec = "ark:compute-mfcc-feats --config={} scp:{} ark:- |".format(
            mfcc_conf, scp_filename)
        ivectors_rspec = (
            "ark:compute-mfcc-feats --config={} scp:{} ark:-"
            " | ivector-extract-online2 --config={} ark:{} ark:- ark:- |"
        ).format(mfcc_conf, scp_filename, ivector_conf, spk2utt_filename)

        did_decode = False
        timing = None
        # Decode wav files
        with SequentialMatrixReader(feats_rspec) as f, \
                SequentialMatrixReader(ivectors_rspec) as i:
            for (fkey, feats), (ikey, ivectors) in zip(f, i):
                did_decode = True
                # Explicit raise instead of ``assert`` so the check is not
                # stripped when Python runs with -O.
                if fkey != ikey:
                    raise AssertionError(
                        "feature key %r does not match ivector key %r" % (fkey, ikey))
                out = recognizer.decode((feats, ivectors))
                best_path = functions.compact_lattice_shortest_path(out["lattice"])
                # NOTE(review): this result is overwritten below; the call is
                # kept only to preserve the original behavior.
                words, _, _ = get_linear_symbol_sequence(shortestpath(best_path))
                timing = functions.compact_lattice_to_word_alignment(best_path)

        if not did_decode:
            raise AssertionError("no utterance was decoded from %s" % scp_filename)

        # Maps words to the numbers
        words = indices_to_symbols(symbols, timing[0])

        # Creates the datastructure (Word, begin(Frames), end(Frames))
        vtt = [list(entry) for entry in zip(words, timing[1], timing[2])]

        return vtt, words
    finally:
        # Cleanup tmp files. A file may be missing if an earlier step failed.
        for tmp_path in (scp_filename, wav_filename, spk2utt_filename):
            if os.path.exists(tmp_path):
                print('removing tmp file:', tmp_path)
                os.remove(tmp_path)
acoustic_model = AmDiagGmm().read(ki.stream(), ki.binary) # Define the decodable wrapper: (features, acoustic_scale) -> decodable def make_decodable_wrapper(trans_model, acoustic_model): def decodable_wrapper(features, acoustic_scale): return DecodableAmDiagGmmScaled(acoustic_model, trans_model, features, acoustic_scale) return decodable_wrapper decodable_wrapper = make_decodable_wrapper(trans_model, acoustic_model) # Define the decoder decoding_graph = read_fst_kaldi("models/mono/graph/HCLG.fst") decoder_opts = LatticeFasterDecoderOptions() decoder_opts.beam = 13.0 decoder_opts.lattice_beam = 6.0 decoder = LatticeFasterDecoder(decoding_graph, decoder_opts) # Define the recognizer symbols = SymbolTable.read_text("models/mono/graph/words.txt") asr = Recognizer(decoder, decodable_wrapper, symbols) # Decode wave files # for key, wav in SequentialWaveReader("scp:wav.scp"): # feats = feat_pipeline(wav) # out = asr.decode(feats) # print(key, out["text"], flush=True)
def main():
    """Decode a data set with a trained LSTM acoustic model.

    Parses the command line, loads the YAML configuration, builds the
    evaluation data loader and the model, restores the checkpoint, sets up
    the lattice decoder and writes one compact lattice per utterance to
    ``-out_file``. The recognized text and the per-frame log-likelihood of
    every utterance are printed.

    A missing required file is reported on stderr and the process exits
    with status 1.
    """

    def _is_file(path):
        # Command-line options default to None when omitted, and
        # os.path.isfile(None) raises TypeError.
        return path is not None and os.path.isfile(path)

    def _exit_error(message):
        # Report a fatal input problem and stop with a failing exit status.
        sys.stderr.write(message)
        sys.exit(1)

    parser = argparse.ArgumentParser()
    parser.add_argument("-config")
    parser.add_argument("-model_path")
    parser.add_argument("-data_path")
    parser.add_argument("-prior_path",
                        help="the path to load the final.occs file")
    parser.add_argument("-out_file",
                        help="write out the log-probs to this file")
    parser.add_argument("-transform",
                        help="feature transformation matrix or mvn statistics")
    parser.add_argument(
        "-trans_model",
        help="the HMM transistion model, used for lattice generation")
    parser.add_argument("-graph_dir", help="the decoding graph directory")
    parser.add_argument("-batch_size",
                        default=32,
                        type=int,
                        help="Override the batch size in the config")
    parser.add_argument("-sweep_size",
                        default=200,
                        type=float,
                        help="process n hours of data per sweep (default:200)")
    parser.add_argument("-data_loader_threads",
                        default=4,
                        type=int,
                        help="number of workers for data loading")

    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)

    config["sweep_size"] = args.sweep_size

    # A single evaluation source pointing at the data given on the command line.
    config["source_paths"] = [{"type": "Eval", "wav": args.data_path}]

    print("job starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    transform = None
    if args.transform is not None and os.path.isfile(args.transform):
        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only pass transform files from a trusted source.
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)

    dataset = SpeechDataset(config)
    test_dataloader = SeqDataloader(dataset,
                                    batch_size=args.batch_size,
                                    test_only=True,
                                    global_mvn=True,
                                    transform=transform)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(test_dataloader)))

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2,
                   model_config["label_size"])

    # Use the selected device for the model, the checkpoint and the inputs
    # alike, instead of computing it and then hard-coding CUDA.
    device = th.device("cuda" if th.cuda.is_available() else "cpu")
    model.to(device)

    if not _is_file(args.model_path):
        _exit_error("ERROR: model file {} does not exist!\n".format(
            args.model_path))

    checkpoint = th.load(args.model_path, map_location=device)
    state_dict = checkpoint['model']
    from collections import OrderedDict
    new_state_dict = OrderedDict()
    prefix = 'module.'
    for k, v in state_dict.items():
        # Drop the 'module.' prefix of DataParallel where present. The check
        # is made per key, so it neither depends on the last key only nor
        # fails on an empty state dict.
        name = k[len(prefix):] if k.startswith(prefix) else k
        new_state_dict[name] = v
    model.load_state_dict(new_state_dict)

    print("=> loaded checkpoint '{}' ".format(args.model_path))

    HCLG = args.graph_dir + "/HCLG.fst"
    words_txt = args.graph_dir + "/words.txt"

    if not os.path.isfile(HCLG):
        _exit_error('ERROR: The HCLG file %s does not exist!\n' % (HCLG))

    if not os.path.isfile(words_txt):
        _exit_error('ERROR: The words.txt file %s does not exist!\n' %
                    (words_txt))

    if not _is_file(args.trans_model):
        _exit_error('ERROR: The trans_model %s does not exist!\n' %
                    (args.trans_model))
    trans_model = kaldi_hmm.TransitionModel()
    with kaldi_util.io.xopen(args.trans_model) as ki:
        trans_model.read(ki.stream(), ki.binary)

    # Normalize the first row of the prior matrix and take its log.
    prior = read_matrix(args.prior_path).numpy()
    log_prior = th.tensor(np.log(prior[0] / np.sum(prior[0])), dtype=th.float)

    # now we can setup the decoder
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = config["decoder_config"]["beam"]
    decoder_opts.lattice_beam = config["decoder_config"]["lattice_beam"]
    decoder_opts.max_active = config["decoder_config"]["max_active"]
    acoustic_scale = config["decoder_config"]["acoustic_scale"]
    decoder_opts.determinize_lattice = True  # To produce compact lattice
    asr_decoder = MappedLatticeFasterRecognizer.from_files(
        args.trans_model,
        HCLG,
        words_txt,
        acoustic_scale=acoustic_scale,
        decoder_opts=decoder_opts)

    model.eval()
    with th.no_grad():
        with kaldi_util.table.CompactLatticeWriter("ark:" +
                                                   args.out_file) as lat_out:
            for data in test_dataloader:
                feat = data["x"]
                num_frs = data["num_frs"]
                utt_ids = data["utt_ids"]

                x = feat.to(th.float32)
                x = x.to(device)
                prediction = model(x)

                for j, n_frames in enumerate(num_frs):
                    loglikes = prediction[j, :, :].data.cpu()

                    # Keep only the first n_frames rows of this utterance,
                    # then subtract the log prior before decoding.
                    loglikes_j = loglikes[:n_frames, :]
                    loglikes_j = loglikes_j - log_prior

                    decoder_out = asr_decoder.decode(
                        kaldi_matrix.Matrix(loglikes_j.numpy()))

                    key = utt_ids[j][0]
                    print(key, decoder_out["text"])

                    print("Log-like per-frame for utterance {} is {}".format(
                        key, decoder_out["likelihood"] / n_frames))

                    # save lattice
                    lat_out[key] = decoder_out["lattice"]