def initNnetFeatPipeline(adaptation_state, asr, decodable_opts, feat_info):
    feat_pipeline = OnlineNnetFeaturePipeline(feat_info)
    feat_pipeline.set_adaptation_state(adaptation_state)
    asr.set_input_pipeline(feat_pipeline)
    asr.init_decoding()
    sil_weighting = OnlineSilenceWeighting(
        asr.transition_model, feat_info.silence_weighting_config,
        decodable_opts.frame_subsampling_factor)
    return feat_pipeline, sil_weighting
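# Usage sketch (an assumption, not part of the original code): create a
# shared i-vector adaptation state once, then call the helper to wire up
# pipeline, decoder state and silence weighting before streaming chunks.
# `asr`, `decodable_opts` and `feat_info` are assumed to be set up as in
# the recognizer construction shown further below.
from kaldi.online2 import OnlineIvectorExtractorAdaptationState

adaptation_state = OnlineIvectorExtractorAdaptationState.from_info(
    feat_info.ivector_extractor_info)
feat_pipeline, sil_weighting = initNnetFeatPipeline(
    adaptation_state, asr, decodable_opts, feat_info)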
def compute_feat(self, wav):
    try:
        feat_pipeline = OnlineNnetFeaturePipeline(self.feat_info)
        feat_pipeline.accept_waveform(self.samp_freq, wav)
        feat_pipeline.input_finished()
    except Exception as e:
        self.log.error(e)
        raise ValueError("Feature extraction failed!") from e
    else:
        return feat_pipeline
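# Usage sketch (assumption, not from the original file): compute_feat is a
# method of the surrounding recognizer class; `rec` below is a hypothetical
# instance of it, and `asr` a NnetLatticeFasterOnlineRecognizer as set up
# elsewhere in this file.
from kaldi.util.table import SequentialWaveReader

for key, wav in SequentialWaveReader("scp:wav.scp"):
    feat_pipeline = rec.compute_feat(wav.data()[0])
    asr.set_input_pipeline(feat_pipeline)
    out = asr.decode()
    print(key, out["text"], flush=True)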
def decode_chunked_partial(scp):
    ## Decode (whole utterance)
    # for key, wav in SequentialWaveReader(scp):
    #     feat_pipeline = OnlineNnetFeaturePipeline(feat_info)
    #     asr.set_input_pipeline(feat_pipeline)
    #     feat_pipeline.accept_waveform(wav.samp_freq, wav.data()[0])
    #     feat_pipeline.input_finished()
    #     out = asr.decode()
    #     print(key, out["text"], flush=True)

    # Decode (chunked + partial output); `asr`, `feat_info` and `chunk_size`
    # are module-level globals here.
    for key, wav in SequentialWaveReader(scp):
        feat_pipeline = OnlineNnetFeaturePipeline(feat_info)
        asr.set_input_pipeline(feat_pipeline)
        asr.init_decoding()
        data = wav.data()[0]
        last_chunk = False
        part = 1
        prev_num_frames_decoded = 0
        for i in range(0, len(data), chunk_size):
            if i + chunk_size >= len(data):
                last_chunk = True
            feat_pipeline.accept_waveform(wav.samp_freq,
                                          data[i:i + chunk_size])
            if last_chunk:
                feat_pipeline.input_finished()
            asr.advance_decoding()
            num_frames_decoded = asr.decoder.num_frames_decoded()
            if not last_chunk:
                if num_frames_decoded > prev_num_frames_decoded:
                    prev_num_frames_decoded = num_frames_decoded
                    out = asr.get_partial_output()
                    print(key + "-part%d" % part, out["text"], flush=True)
                    part += 1
        asr.finalize_decoding()
        out = asr.get_output()
        print(key + "-final", out["text"], flush=True)
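# Setup sketch (assumption, following the standard PyKaldi online-decoding
# examples): the module-level `asr`, `feat_info` and `chunk_size` used above
# could be created like this. The file names ("final.mdl", "HCLG.fst",
# "words.txt", "online.conf") are placeholders.
from kaldi.asr import NnetLatticeFasterOnlineRecognizer
from kaldi.decoder import LatticeFasterDecoderOptions
from kaldi.nnet3 import NnetSimpleLoopedComputationOptions
from kaldi.online2 import (OnlineEndpointConfig,
                           OnlineNnetFeaturePipelineConfig,
                           OnlineNnetFeaturePipelineInfo)
from kaldi.util.options import ParseOptions

chunk_size = 1440  # samples per chunk

feat_opts = OnlineNnetFeaturePipelineConfig()
endpoint_opts = OnlineEndpointConfig()
po = ParseOptions("")
feat_opts.register(po)
endpoint_opts.register(po)
po.read_config_file("online.conf")
feat_info = OnlineNnetFeaturePipelineInfo.from_config(feat_opts)

decoder_opts = LatticeFasterDecoderOptions()
decoder_opts.beam = 13
decoder_opts.max_active = 7000
decodable_opts = NnetSimpleLoopedComputationOptions()
decodable_opts.acoustic_scale = 1.0
decodable_opts.frame_subsampling_factor = 3
asr = NnetLatticeFasterOnlineRecognizer.from_files(
    "final.mdl", "HCLG.fst", "words.txt",
    decoder_opts=decoder_opts,
    decodable_opts=decodable_opts,
    endpoint_opts=endpoint_opts)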
def decode_chunked_partial_endpointing(asr, feat_info, decodable_opts, scp,
                                       chunk_size=1024,
                                       compute_confidences=True,
                                       asr_client=None, speaker="Speaker",
                                       pad_confidences=True):
    # Decode (chunked + partial output + endpointing
    #         + ivector adaptation + silence weighting)
    adaptation_state = OnlineIvectorExtractorAdaptationState.from_info(
        feat_info.ivector_extractor_info)
    for key, wav in SequentialWaveReader(scp):
        feat_pipeline = OnlineNnetFeaturePipeline(feat_info)
        feat_pipeline.set_adaptation_state(adaptation_state)
        asr.set_input_pipeline(feat_pipeline)
        asr.init_decoding()
        sil_weighting = OnlineSilenceWeighting(
            asr.transition_model, feat_info.silence_weighting_config,
            decodable_opts.frame_subsampling_factor)
        data = wav.data()[0]
        last_chunk = False
        utt, part = 1, 1
        prev_num_frames_decoded, offset = 0, 0
        for i in range(0, len(data), chunk_size):
            if i + chunk_size >= len(data):
                last_chunk = True
            feat_pipeline.accept_waveform(wav.samp_freq,
                                          data[i:i + chunk_size])
            if last_chunk:
                feat_pipeline.input_finished()
            if sil_weighting.active():
                sil_weighting.compute_current_traceback(asr.decoder)
                feat_pipeline.ivector_feature().update_frame_weights(
                    sil_weighting.get_delta_weights(
                        feat_pipeline.num_frames_ready()))
            asr.advance_decoding()
            num_frames_decoded = asr.decoder.num_frames_decoded()
            if not last_chunk:
                if asr.endpoint_detected():
                    asr.finalize_decoding()
                    out = asr.get_output()
                    mbr = MinimumBayesRisk(out["lattice"])
                    confd = mbr.get_one_best_confidences()
                    if pad_confidences:
                        token_length = len(out["text"].split())
                        # The confidence array can come out shorter or longer
                        # than the decoded token sequence; pad or slice it to
                        # match the token length.
                        if len(confd) < token_length:
                            print("WARNING: fewer computed confidences than "
                                  "tokens! Fixing this with padding!")
                            confd = np.pad(confd,
                                           [0, token_length - len(confd)],
                                           mode='constant',
                                           constant_values=1.0)
                        elif len(confd) > token_length:
                            print("WARNING: more computed confidences than "
                                  "tokens! Fixing this with slicing!")
                            confd = confd[:token_length]
                    # print(key + "-utt%d-final" % utt, out["text"], flush=True)
                    if asr_client is not None:
                        asr_client.completeUtterance(
                            utterance=out["text"],
                            key=key + "-utt%d-part%d" % (utt, part),
                            confidences=confd)
                    # Start of the next utterance in samples: frames decoded
                    # so far times frame shift, converted to the wav's rate.
                    offset += int(num_frames_decoded
                                  * decodable_opts.frame_subsampling_factor
                                  * feat_pipeline.frame_shift_in_seconds()
                                  * wav.samp_freq)
                    feat_pipeline.get_adaptation_state(adaptation_state)
                    # Re-initialize pipeline, decoder state and silence
                    # weighting for the next utterance, then re-feed the
                    # remainder of the current chunk.
                    feat_pipeline = OnlineNnetFeaturePipeline(feat_info)
                    feat_pipeline.set_adaptation_state(adaptation_state)
                    asr.set_input_pipeline(feat_pipeline)
                    asr.init_decoding()
                    sil_weighting = OnlineSilenceWeighting(
                        asr.transition_model,
                        feat_info.silence_weighting_config,
                        decodable_opts.frame_subsampling_factor)
                    remainder = data[offset:i + chunk_size]
                    feat_pipeline.accept_waveform(wav.samp_freq, remainder)
                    utt += 1
                    part = 1
                    prev_num_frames_decoded = 0
                elif num_frames_decoded > prev_num_frames_decoded:
                    prev_num_frames_decoded = num_frames_decoded
                    out = asr.get_partial_output()
                    # print(key + "-utt%d-part%d" % (utt, part),
                    #       out["text"], flush=True)
                    if asr_client is not None:
                        asr_client.partialUtterance(
                            utterance=out["text"],
                            key=key + "-utt%d-part%d" % (utt, part))
                    part += 1
        asr.finalize_decoding()
        out = asr.get_output()
        mbr = MinimumBayesRisk(out["lattice"])
        confd = mbr.get_one_best_confidences()
        # print(key + "-utt%d-final" % utt, out["text"], flush=True)
        if asr_client is not None:
            asr_client.completeUtterance(
                utterance=out["text"],
                key=key + "-utt%d-part%d" % (utt, part),
                confidences=confd)
        feat_pipeline.get_adaptation_state(adaptation_state)
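# Usage sketch (assumption): run the endpointing decode over an scp of wavs,
# with `asr`, `feat_info` and `decodable_opts` built as in the setup sketch
# above. With asr_client=None, results are only printed locally.
decode_chunked_partial_endpointing(asr, feat_info, decodable_opts,
                                   "scp:wav.scp", chunk_size=1024)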
# (decoder_opts, endpoint_opts and feat_info are assumed to be set up
# earlier, as in the setup sketch above.)
decoder_opts.max_active = 7000
decodable_opts = NnetSimpleLoopedComputationOptions()
decodable_opts.acoustic_scale = 1.0
decodable_opts.frame_subsampling_factor = 3
decodable_opts.frames_per_chunk = 150
asr = NnetLatticeFasterOnlineRecognizer.from_files(
    "final.mdl", "HCLG.fst", "words.txt",
    decoder_opts=decoder_opts,
    decodable_opts=decodable_opts,
    endpoint_opts=endpoint_opts)

# Decode (whole utterance)
for key, wav in SequentialWaveReader("scp:wav.scp"):
    feat_pipeline = OnlineNnetFeaturePipeline(feat_info)
    asr.set_input_pipeline(feat_pipeline)
    feat_pipeline.accept_waveform(wav.samp_freq, wav.data()[0])
    feat_pipeline.input_finished()
    out = asr.decode()
    print(key, out["text"], flush=True)

# Decode (chunked + partial output); the chunk loop continues as in
# decode_chunked_partial above.
for key, wav in SequentialWaveReader("scp:wav.scp"):
    feat_pipeline = OnlineNnetFeaturePipeline(feat_info)
    asr.set_input_pipeline(feat_pipeline)
    asr.init_decoding()
    data = wav.data()[0]
    last_chunk = False
    part = 1
    prev_num_frames_decoded = 0
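# Sketch (assumption, not in the original): besides "text", the output dict
# of the lattice recognizers also carries the decoded lattice (used for MBR
# confidences elsewhere in this file); it can be persisted with PyKaldi's
# table writers, mirroring the whole-utterance loop above.
from kaldi.util.table import CompactLatticeWriter

with CompactLatticeWriter("ark:lat.ark") as lat_out:
    for key, wav in SequentialWaveReader("scp:wav.scp"):
        feat_pipeline = OnlineNnetFeaturePipeline(feat_info)
        asr.set_input_pipeline(feat_pipeline)
        feat_pipeline.accept_waveform(wav.samp_freq, wav.data()[0])
        feat_pipeline.input_finished()
        out = asr.decode()
        lat_out[key] = out["lattice"]  # CompactLattice, as consumed by MBR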
        ])
        audio_transcripts = audio_transcripts.sort_values(by=0)
    else:
        audio_transcripts = pd.concat([
            pd.read_csv(text_path, header=None, engine='python')
            for text_path in text_pathes
        ])
        audio_transcripts = audio_transcripts.sort_values(by=0)
    # Each line is "<key> <transcript>"; build a key -> lowercased text dict.
    audio_transcripts = audio_transcripts[0].str.split(" ", n=1, expand=True)
    audio_transcripts[1] = audio_transcripts[1].str.lower()
    audio_transcripts = audio_transcripts.set_index(0)[1].to_dict()

    # Decode (whole utterance)
    num_of_audiofiles = 0
    for key, wav in SequentialWaveReader("scp:" + scp_path):
        feat_pipeline = OnlineNnetFeaturePipeline(feat_info)
        asr.set_input_pipeline(feat_pipeline)
        feat_pipeline.accept_waveform(wav.samp_freq, wav.data()[0])
        feat_pipeline.input_finished()
        # The scp keys double as file paths here, so the audio can be
        # re-read with soundfile to determine its duration.
        audio_path = key
        try:
            audio, fs = sf.read(audio_path, dtype='int16')
        except Exception:
            if VERBOSE:
                print("# WARNING :: Audio File " + audio_path +
                      " not readable.\n")
            log_file.write("# WARNING :: Audio File " + audio_path +
                           " not readable.\n")
            continue
        audio_len = len(audio) / fs
        print('Running inference.\n', file=sys.stderr)
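        # Usage sketch (assumption): the key -> transcript dict built above
        # can be used to fetch the reference text for the current utterance,
        # e.g. for later WER scoring; wav.scp keys and transcript keys are
        # assumed to match.
        ref_text = audio_transcripts.get(key, "")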
decodable_opts = NnetSimpleLoopedComputationOptions()
decodable_opts.acoustic_scale = 1.0
decodable_opts.frame_subsampling_factor = 3
decodable_opts.frames_per_chunk = 50  # smallish to force many updates
asr = NnetLatticeFasterOnlineRecognizer.from_files(
    "final.mdl", "HCLG.fst", "words.txt",
    decoder_opts=decoder_opts,
    decodable_opts=decodable_opts,
    endpoint_opts=endpoint_opts)

# Decode (chunked + partial output + log-likelihoods)
with MatrixWriter("ark:loglikes.ark") as llout:
    for key, wav in SequentialWaveReader("scp:wav.scp"):
        feat_pipeline = OnlineNnetFeaturePipeline(feat_info)
        asr.set_input_pipeline(feat_pipeline)
        # Keep a handle on the decodable object to read log-likelihoods
        # from it as frames are computed.
        d = asr._decodable
        asr.init_decoding()
        data = wav.data()[0]
        last_chunk = False
        part = 1
        prev_num_frames_decoded = 0
        prev_num_frames_computed = 0
        llhs = list()
        for i in range(0, len(data), chunk_size):
            if i + chunk_size >= len(data):
                last_chunk = True
            feat_pipeline.accept_waveform(wav.samp_freq,
                                          data[i:i + chunk_size])
            if last_chunk:
                feat_pipeline.input_finished()
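            # Continuation sketch (assumption, not the original code): read
            # the newly computed frames out of the decodable via the standard
            # DecodableInterface (num_frames_ready, log_likelihood with
            # one-based indices), emit partial outputs as in the chunked
            # loops above, and finally dump the collected rows as a matrix.
            asr.advance_decoding()
            num_frames_computed = d.num_frames_ready()
            for frame in range(prev_num_frames_computed, num_frames_computed):
                llhs.append([d.log_likelihood(frame, idx)
                             for idx in range(1, d.num_indices() + 1)])
            prev_num_frames_computed = num_frames_computed
            num_frames_decoded = asr.decoder.num_frames_decoded()
            if not last_chunk and num_frames_decoded > prev_num_frames_decoded:
                prev_num_frames_decoded = num_frames_decoded
                out = asr.get_partial_output()
                print(key + "-part%d" % part, out["text"], flush=True)
                part += 1
        asr.finalize_decoding()
        out = asr.get_output()
        print(key + "-final", out["text"], flush=True)
        llout[key] = Matrix(llhs)  # assumes: from kaldi.matrix import Matrix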