def __init__(self, *args, **kwargs):
    super(Tester, self).__init__(*args, **kwargs)
    print('Initializing aspire model...')

    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = 13
    decoder_opts.max_active = 7000
    decodable_opts = NnetSimpleComputationOptions()
    decodable_opts.acoustic_scale = 1.0
    decodable_opts.frame_subsampling_factor = 3
    decodable_opts.frames_per_chunk = 150
    self.asr = NnetLatticeFasterRecognizer.from_files(
        "/home/chris/git/pykaldi/examples/setups/aspire/exp/tdnn_7b_chain_online/final.mdl",
        "/home/chris/git/pykaldi/examples/setups/aspire/exp/tdnn_7b_chain_online/graph_pp/HCLG.fst",
        "/home/chris/git/pykaldi/examples/setups/aspire/data/lang/words.txt",
        decoder_opts=decoder_opts,
        decodable_opts=decodable_opts)

    _, fn = tempfile.mkstemp()
    os.remove(fn)
    self.scp_fn = scp_fn = '%s.scp' % fn

    # Define feature pipelines as Kaldi rspecifiers
    self.feats_rspec = (
        f"ark:compute-mfcc-feats --config=/home/chris/git/pykaldi/examples/setups/aspire/conf/mfcc_hires.conf scp:{scp_fn} ark:- |"
    )
    self.ivectors_rspec = (
        f"ark:compute-mfcc-feats --config=/home/chris/git/pykaldi/examples/setups/aspire/conf/mfcc_hires.conf scp:{scp_fn} ark:- |"
        f"ivector-extract-online2 --config=/home/chris/git/pykaldi/examples/setups/aspire/conf/ivector_extractor.conf "
        f"ark:/home/chris/git/pykaldi/examples/setups/aspire/data/test/spk2utt ark:- ark:- |"
    )
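
# A minimal usage sketch (not part of the original snippet): register a wav
# file in the temporary scp, then run both feature pipelines and decode.
# The method name `decode_file` and the utterance id are hypothetical, and
# the utterance id must match an entry in the spk2utt file referenced above.
# Assumes SequentialMatrixReader is imported from kaldi.util.table.
def decode_file(self, wav_fn, utt_id="utt1"):
    with open(self.scp_fn, "w") as scp:
        scp.write("%s %s\n" % (utt_id, wav_fn))
    with SequentialMatrixReader(self.feats_rspec) as f, \
         SequentialMatrixReader(self.ivectors_rspec) as i:
        for (fkey, feats), (ikey, ivectors) in zip(f, i):
            assert fkey == ikey
            return self.asr.decode((feats, ivectors))["text"]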
def __init__(self, scp, model, graph, words, conf, iconf, spk2utt, output,
             printed=False, log=False):
    """
    Initialize the transcriber.

    Arguments:
        scp: path to the .SCP file listing the audio
        model: path to the .MDL recognition model file
        graph: path to the .FST file with the full decoding graph
        words: path to the .TXT word symbol table file
        conf: path to the .CONF recognition configuration file
        iconf: path to the .CONF i-vector extractor configuration file
        spk2utt: path to the file listing the segments for each speaker
        output: path to the directory for recognition results
        printed: whether to print recognition results
        log: whether to enable logging
    """
    self.scp = scp
    self.model = model
    self.graph = graph
    self.words = words
    self.conf = conf
    self.iconf = iconf
    self.spk2utt = spk2utt
    self.output = Path(output)
    self.printed = printed
    self.log = log

    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = 13
    decoder_opts.max_active = 7000
    decodable_opts = NnetSimpleComputationOptions()
    decodable_opts.acoustic_scale = 1.0
    decodable_opts.frame_subsampling_factor = 3
    self.asr = NnetLatticeFasterRecognizer.from_files(
        self.model, self.graph, self.words,
        decoder_opts=decoder_opts, decodable_opts=decodable_opts)
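
# A minimal sketch of how these stored paths are typically combined into
# Kaldi rspecifiers and decoded. The method name `transcribe` and the output
# file name 'text' are assumptions; the pipeline mirrors the other snippets
# in this section.
def transcribe(self):
    feats_rspec = ("ark:compute-mfcc-feats --config=%s scp:%s ark:- |"
                   % (self.conf, self.scp))
    ivectors_rspec = (
        "ark:compute-mfcc-feats --config=%s scp:%s ark:- |"
        "ivector-extract-online2 --config=%s ark:%s ark:- ark:- |"
        % (self.conf, self.scp, self.iconf, self.spk2utt))
    with SequentialMatrixReader(feats_rspec) as f, \
         SequentialMatrixReader(ivectors_rspec) as i, \
         open(self.output / 'text', 'w') as o:
        for (fkey, feats), (ikey, ivectors) in zip(f, i):
            assert fkey == ikey
            out = self.asr.decode((feats, ivectors))
            if self.printed:
                print(fkey, out['text'])
            print(fkey, out['text'], file=o)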
def __init__(self, scp, model, post, conf, output, log=False):
    """
    Initialize the segmenter.

    Arguments:
        scp: path to the .SCP file listing the audio
        model: path to the .RAW segmentation model file
        post: path to the .VEC file with average segmentation posteriors
        conf: path to the .CONF segmentation configuration file
        output: path to the directory for segmentation results
        log: whether to enable logging
    """
    self.scp = scp
    self.model = model
    self.post = post
    self.conf = conf
    self.output = Path(output)
    self.log = log

    sad_model = NnetSAD.read_model(model)
    sad_post = NnetSAD.read_average_posteriors(post)
    sad_transform = NnetSAD.make_sad_transform(sad_post)
    sad_graph = NnetSAD.make_sad_graph()
    decodable_opts = NnetSimpleComputationOptions()
    decodable_opts.extra_left_context = 79
    decodable_opts.extra_right_context = 21
    decodable_opts.extra_left_context_initial = 0
    decodable_opts.extra_right_context_final = 0
    decodable_opts.frames_per_chunk = 150
    decodable_opts.acoustic_scale = 0.3
    self.sad = NnetSAD(sad_model, sad_transform, sad_graph,
                       decodable_opts=decodable_opts)
    self.seg = SegmentationProcessor([2])
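
# A minimal sketch of running segmentation with the objects built above,
# following pykaldi's published SAD example. The method name `segment` and
# the 'segments' output file name are assumptions.
def segment(self):
    feats_rspec = ("ark:compute-mfcc-feats --config=%s scp:%s ark:- |"
                   % (self.conf, self.scp))
    with SequentialMatrixReader(feats_rspec) as f, \
         open(self.output / 'segments', 'w') as s:
        for key, feats in f:
            out = self.sad.segment(feats)
            segments, _ = self.seg.process(out['alignment'])
            self.seg.write(key, segments, s)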
#!/usr/bin/env python

from __future__ import print_function

from kaldi.asr import NnetLatticeFasterRecognizer
from kaldi.decoder import LatticeFasterDecoderOptions
from kaldi.nnet3 import NnetSimpleComputationOptions
from kaldi.util.table import SequentialMatrixReader

# Construct recognizer
decoder_opts = LatticeFasterDecoderOptions()
decoder_opts.beam = 13
decoder_opts.max_active = 7000
decodable_opts = NnetSimpleComputationOptions()
decodable_opts.acoustic_scale = 1.0
decodable_opts.frame_subsampling_factor = 3
decodable_opts.frames_per_chunk = 150
asr = NnetLatticeFasterRecognizer.from_files(
    "exp/tdnn_7b_chain_online/final.mdl",
    "exp/tdnn_7b_chain_online/graph_pp/HCLG.fst",
    "exp/tdnn_7b_chain_online/graph_pp/words.txt",
    decoder_opts=decoder_opts, decodable_opts=decodable_opts)

# Define feature pipelines as Kaldi rspecifiers
feats_rspec = (
    "ark:compute-mfcc-feats --config=conf/mfcc.conf scp:data/wav.scp ark:- |"
)
ivectors_rspec = (
    "ark:compute-mfcc-feats --config=conf/mfcc.conf scp:data/wav.scp ark:- |"
    "ivector-extract-online2 --config=conf/ivector.conf ark:data/spk2utt ark:- ark:- |"
)
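
# The snippet imports SequentialMatrixReader but stops before using it; the
# published aspire example typically continues with a decoding loop like the
# following (the output path is illustrative):
with SequentialMatrixReader(feats_rspec) as f, \
     SequentialMatrixReader(ivectors_rspec) as i, \
     open("out/decode.out", "w") as o:
    for (fkey, feats), (ikey, ivectors) in zip(f, i):
        assert fkey == ikey
        out = asr.decode((feats, ivectors))
        print(fkey, out["text"], file=o)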
def main(self):
    # Construct recognizer
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = 13
    decoder_opts.max_active = 7000
    decodable_opts = NnetSimpleComputationOptions()
    decodable_opts.acoustic_scale = 1.0
    decodable_opts.frame_subsampling_factor = 3
    decodable_opts.frames_per_chunk = 150
    asr = NnetLatticeFasterRecognizer.from_files(
        self.dir_path + "/exp/tdnn_7b_chain_online/final.mdl",
        self.dir_path + "/new/graph/HCLG.fst",
        self.dir_path + "/new/graph/words.txt",
        decoder_opts=decoder_opts, decodable_opts=decodable_opts)

    p = pyaudio.PyAudio()

    ############################################
    # sentiment_analyzer = SentimentAnalyzer(self.dir_path)
    # model = load_model(self.dir_path + '/lstm.h5')
    ############################################

    stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE,
                    input=True, frames_per_buffer=CHUNK)
    print("* recording")

    audio2send = []
    cur_data = ''  # current chunk of audio data
    rel = RATE / CHUNK
    slid_win = deque(maxlen=int(SILENCE_LIMIT * rel) + 1)
    # Prepend audio from 0.5 seconds before noise was detected
    prev_audio = deque(maxlen=int(PREV_AUDIO * rel) + 1)
    started = False
    # num_phrases (like CHUNK, RATE, THRESHOLD, SILENCE_LIMIT, PREV_AUDIO)
    # is assumed to be a module-level constant; -1 means record indefinitely.
    n = num_phrases
    response = []

    while num_phrases == -1 or n > 0:
        cur_data = stream.read(CHUNK)
        slid_win.append(math.sqrt(abs(audioop.avg(cur_data, 4))))
        # print(slid_win[-1])
        if sum([x > THRESHOLD for x in slid_win]) > 0:
            if not started:
                # print("Starting record of phrase")
                started = True
            audio2send.append(cur_data)
        elif started is True:
            # print("Finished")
            # The limit was reached, finish capture and deliver.
            filename = self.save_speech(list(prev_audio) + audio2send, p,
                                        self.SAVE_PATH,
                                        self.WAVE_OUTPUT_FILENAME)
            # Send file to the recognizer and get the response
            r = self.recognize_speech(asr)
            if num_phrases == -1:
                print("Detected speech: ", r)
                # if r != None:
                #     sentiment_analyzer.get_sentiment(r, model)
            else:
                response.append(r)
            # Remove temp file. Comment line to review.
            os.remove(filename)
            # Reset all
            started = False
            slid_win = deque(maxlen=int(SILENCE_LIMIT * rel) + 1)
            prev_audio = deque(maxlen=int(0.5 * rel) + 1)
            audio2send = []
            n -= 1
        else:
            prev_audio.append(cur_data)

    stream.stop_stream()
    stream.close()
    p.terminate()
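
# A minimal sketch of the save_speech helper called above (its real
# implementation is not shown here). It assumes the module-level FORMAT,
# CHANNELS and RATE constants plus the standard `wave` and `os` imports,
# and returns the path of the written file, matching the call site.
def save_speech(self, data, p, save_path, output_filename):
    filename = os.path.join(save_path, output_filename)
    wf = wave.open(filename, 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(b''.join(data))  # join the buffered raw chunks
    wf.close()
    return filename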
def init(self, nnet_directory, transcription_directory):
    return_msg = "KaldiDecoder:init"
    debug_data = []
    feats = ""
    ivectors = ""
    decoder_opts = None
    decodable_opts = None
    asr = None

    ## input validation
    if nnet_directory is not None:
        if type(nnet_directory) is not str:
            return_msg += ": nnet_directory is not of type string, is type {}".format(
                type(nnet_directory))
            return {
                RDK.success: RC.input_validation,
                RDK.return_msg: return_msg,
                RDK.debug_data: debug_data
            }
    else:
        nnet_directory = KaldiNnetDecoder.CV_default_nnet_directory

    if transcription_directory is not None:
        if type(transcription_directory) is not str:
            return_msg += ": transcription_directory is not of type string, is type {}".format(
                type(transcription_directory))
            return {
                RDK.success: RC.input_validation,
                RDK.return_msg: return_msg,
                RDK.debug_data: debug_data
            }
    else:
        transcription_directory = KaldiNnetDecoder.CV_default_transcription_directory
    ##</end> input validation

    ## feats and ivector rspec creation
    feats = (
        "ark:compute-mfcc-feats --config={0}/conf/mfcc.conf scp:{1}/wav.scp ark:- |"
    ).format(nnet_directory, transcription_directory)
    ivectors = (
        "ark:compute-mfcc-feats --config={0}/conf/mfcc.conf scp:{1}/wav.scp ark:- |"
        "ivector-extract-online2 --config={0}/conf/ivector_extractor.conf ark:{1}/spk2utt ark:- ark:- |"
    ).format(nnet_directory, transcription_directory)
    ##</end> feats and ivector rspec creation

    ## asr creation
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = 13
    decoder_opts.max_active = 7000
    decodable_opts = NnetSimpleComputationOptions()
    decodable_opts.acoustic_scale = 1.0
    decodable_opts.frame_subsampling_factor = 3
    decodable_opts.frames_per_chunk = 150
    asr = NnetLatticeFasterRecognizer.from_files(
        "{}/final.mdl".format(nnet_directory),
        "{}/graph/HCLG.fst".format(nnet_directory),
        "{}/graph/words.txt".format(nnet_directory),
        decoder_opts=decoder_opts, decodable_opts=decodable_opts)
    ##</end> asr creation

    self.IV_feats = feats
    self.IV_ivectors = ivectors
    self.IV_asr = asr
    self.IV_is_ready = True

    return {
        RDK.success: RC.success,
        RDK.return_msg: return_msg,
        RDK.debug_data: debug_data
    }
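
# A hypothetical call site for init(). The no-argument constructor is an
# assumption; passing None for both directories selects the class defaults,
# and the returned dict is checked via the RDK/RC keys used above.
decoder = KaldiNnetDecoder()
result = decoder.init(None, None)
if result[RDK.success] != RC.success:
    print(result[RDK.return_msg])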
def asr(filenameS_hash, filenameS, asr_beamsize=13, asr_max_active=8000):
    models_dir = "models/"

    # Read yaml file
    config_file = "models/kaldi_tuda_de_nnet3_chain2.yaml"
    with open(config_file, 'r') as stream:
        model_yaml = yaml.safe_load(stream)
    decoder_yaml_opts = model_yaml['decoder']

    scp_filename = "tmp/%s.scp" % filenameS_hash
    wav_filename = "tmp/%s.wav" % filenameS_hash
    spk2utt_filename = "tmp/%s_spk2utt" % filenameS_hash

    # write scp file
    with open(scp_filename, 'w') as scp_file:
        scp_file.write("%s tmp/%s.wav\n" % (filenameS_hash, filenameS_hash))

    # write spk2utt file
    with open(spk2utt_filename, 'w') as spk2utt_file:
        spk2utt_file.write("%s %s\n" % (filenameS_hash, filenameS_hash))

    # use ffmpeg to convert the input media file (any format!) to 16 kHz wav mono
    (
        ffmpeg
        .input(filenameS)
        .output("tmp/%s.wav" % filenameS_hash, acodec='pcm_s16le', ac=1, ar='16k')
        .overwrite_output()
        .run()
    )

    # Construct recognizer
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = asr_beamsize
    decoder_opts.max_active = asr_max_active
    decodable_opts = NnetSimpleComputationOptions()
    decodable_opts.acoustic_scale = 1.0
    decodable_opts.frame_subsampling_factor = 3
    decodable_opts.frames_per_chunk = 150
    asr = NnetLatticeFasterRecognizer.from_files(
        models_dir + decoder_yaml_opts["model"],
        models_dir + decoder_yaml_opts["fst"],
        models_dir + decoder_yaml_opts["word-syms"],
        decoder_opts=decoder_opts, decodable_opts=decodable_opts)

    # Construct symbol table
    symbols = SymbolTable.read_text(models_dir + decoder_yaml_opts["word-syms"])
    phi_label = symbols.find_index("#0")

    # Define feature pipelines as Kaldi rspecifiers
    feats_rspec = ("ark:compute-mfcc-feats --config=%s scp:" + scp_filename + " ark:- |") % \
        (models_dir + decoder_yaml_opts["mfcc-config"])
    ivectors_rspec = (
        ("ark:compute-mfcc-feats --config=%s scp:" + scp_filename + " ark:-"
         " | ivector-extract-online2 --config=%s ark:" + spk2utt_filename +
         " ark:- ark:- |")
        % ((models_dir + decoder_yaml_opts["mfcc-config"]),
           (models_dir + decoder_yaml_opts["ivector-extraction-config"]))
    )

    did_decode = False
    # Decode wav files
    with SequentialMatrixReader(feats_rspec) as f, \
            SequentialMatrixReader(ivectors_rspec) as i:
        for (fkey, feats), (ikey, ivectors) in zip(f, i):
            did_decode = True
            assert fkey == ikey
            out = asr.decode((feats, ivectors))
            best_path = functions.compact_lattice_shortest_path(out["lattice"])
            words, _, _ = get_linear_symbol_sequence(shortestpath(best_path))
            timing = functions.compact_lattice_to_word_alignment(best_path)
    assert did_decode

    # Map the word indices in the alignment back to word symbols
    words = indices_to_symbols(symbols, timing[0])

    # Create the data structure (word, begin frame, duration in frames)
    vtt = list(map(list, zip(words, timing[1], timing[2])))

    # Cleanup tmp files
    print('removing tmp file:', scp_filename)
    os.remove(scp_filename)
    print('removing tmp file:', wav_filename)
    os.remove(wav_filename)
    print('removing tmp file:', spk2utt_filename)
    os.remove(spk2utt_filename)

    return vtt, words
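
# A hypothetical usage example: run asr() on a media file and print word
# timings. The frame indices in vtt are at the subsampled frame rate, so
# with a 10 ms frame shift and frame_subsampling_factor=3 each index spans
# roughly 0.03 s. The hash and file name below are illustrative.
vtt, words = asr("somehash", "example.mp4")
for word, begin, duration in vtt:
    start = begin * 0.03
    end = (begin + duration) * 0.03
    print("%.2f-%.2f\t%s" % (start, end, word))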
# Reading from config file
parser = ConfigParser()
parser.read('sad_model.conf')
samplerate = int(parser.get('AUDIO', 'SAMPLE_RATE'))
n_channels = int(parser.get('AUDIO', 'N_CHANNELS'))
encoding = parser.get('AUDIO', 'ENCODING')
sad_final_raw = parser.get('SAD', 'FINAL_RAW')
post_output_vec = parser.get('SAD', 'POST_OUTPUT_VEC')

# Construct SAD
model = NnetSAD.read_model(sad_final_raw)
post = NnetSAD.read_average_posteriors(post_output_vec)
transform = NnetSAD.make_sad_transform(post)
graph = NnetSAD.make_sad_graph(min_silence_duration=0.1)
decodable_opts = NnetSimpleComputationOptions()
decodable_opts.extra_left_context = 79
decodable_opts.extra_right_context = 21
decodable_opts.extra_left_context_initial = 0
decodable_opts.extra_right_context_final = 0
decodable_opts.frames_per_chunk = 150
decodable_opts.acoustic_scale = 0.3
sad = NnetSAD(model, transform, graph, decodable_opts=decodable_opts)
seg = SegmentationProcessor(target_labels=[2], min_segment_dur=3,
                            max_merged_segment_dur=3.84)


def convert_to_wav(path):
    '''
    1. Converts audio file in other formats to wav format