def transcribe(self, wavfile, progress_cb=None):
    wav_obj = wave.open(wavfile, 'r')
    duration = wav_obj.getnframes() / float(wav_obj.getframerate())
    n_chunks = int(
        math.ceil(duration / float(self.chunk_len - self.overlap_t)))

    chunks = []

    def transcribe_chunk(idx):
        wav_obj = wave.open(wavfile, 'r')
        start_t = idx * (self.chunk_len - self.overlap_t)
        # Seek
        wav_obj.setpos(int(start_t * wav_obj.getframerate()))
        # Read frames
        buf = wav_obj.readframes(
            int(self.chunk_len * wav_obj.getframerate()))

        k = self.kaldi_queue.get()
        k.push_chunk(buf)
        ret = k.get_final()
        k.reset()
        self.kaldi_queue.put(k)

        chunks.append({"start": start_t, "words": ret})
        logging.info('%d/%d' % (len(chunks), n_chunks))
        if progress_cb is not None:
            progress_cb({
                "message": ' '.join([X['word'] for X in ret]),
                "percent": len(chunks) / float(n_chunks)
            })

    pool = Pool(min(n_chunks, self.nthreads))
    pool.map(transcribe_chunk, range(n_chunks))
    pool.close()

    chunks.sort(key=lambda x: x['start'])

    # Combine chunks
    words = []
    for c in chunks:
        chunk_start = c['start']
        for wd in c['words']:
            wd['start'] += chunk_start
            words.append(transcription.Word(**wd))

    # Remove overlap: Sort by time, then filter out any Word entries in
    # the list that are adjacent to another entry corresponding to the same
    # word in the audio.
    words.sort(key=lambda word: word.start)
    words.append(transcription.Word(word="__dummy__"))
    words = [
        words[i] for i in range(len(words) - 1)
        if not words[i].corresponds(words[i + 1])
    ]

    return words
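The chunking arithmetic at the top of transcribe() is easiest to see in isolation. A minimal standalone sketch, with assumed values for chunk_len, overlap_t, and duration (in the real code these come from the transcriber settings and the WAV header):

# Standalone sketch of the chunking arithmetic above (values assumed;
# no Kaldi or wave I/O involved).  Each chunk starts chunk_len - overlap_t
# after the previous one, so adjacent chunks share overlap_t seconds of audio.
import math

chunk_len = 20.0   # seconds per chunk (assumed)
overlap_t = 2.0    # seconds of overlap between adjacent chunks (assumed)
duration = 65.0    # total audio duration in seconds (assumed)

n_chunks = int(math.ceil(duration / float(chunk_len - overlap_t)))
for idx in range(n_chunks):
    start_t = idx * (chunk_len - overlap_t)
    end_t = min(start_t + chunk_len, duration)
    print("chunk %d: %.1f-%.1fs" % (idx, start_t, end_t))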
def realign(chunk):
    wav_obj = wave.open(wavfile, 'r')

    if chunk["start"] is None:
        start_t = 0
    else:
        start_t = chunk["start"].end

    if chunk["end"] is None:
        end_t = wav_obj.getnframes() / float(wav_obj.getframerate())
    else:
        end_t = chunk["end"].start

    duration = end_t - start_t
    if duration < 0.01 or duration > 60:
        logging.debug("cannot realign %d words with duration %f" % (
            len(chunk['words']), duration))
        return

    # Create a language model
    offset_offset = chunk['words'][0].startOffset
    chunk_len = chunk['words'][-1].endOffset - offset_offset
    chunk_transcript = ms.raw_sentence[offset_offset:offset_offset +
                                       chunk_len].encode("utf-8")
    chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
    chunk_ks = chunk_ms.get_kaldi_sequence()

    chunk_gen_hclg_filename = language_model.make_bigram_language_model(
        chunk_ks, resources.proto_langdir)
    k = standard_kaldi.Kaldi(resources.nnet_gpu_path,
                             chunk_gen_hclg_filename,
                             resources.proto_langdir)

    wav_obj = wave.open(wavfile, 'r')
    wav_obj.setpos(int(start_t * wav_obj.getframerate()))
    buf = wav_obj.readframes(int(duration * wav_obj.getframerate()))

    k.push_chunk(buf)
    ret = [transcription.Word(**wd) for wd in k.get_final()]
    k.stop()

    word_alignment = diff_align.align(ret, chunk_ms)

    # Adjust startOffset, endOffset, and timing to match originals
    for wd in word_alignment:
        if wd.end is not None:
            # Apply timing offset
            wd.start += start_t
            wd.end += start_t

        if wd.endOffset is not None:
            wd.startOffset += offset_offset
            wd.endOffset += offset_offset

    # "chunk" should be replaced by "words"
    realignments.append({"chunk": chunk, "words": word_alignment})

    if progress_cb is not None:
        progress_cb(
            {"percent": len(realignments) / float(len(to_realign))})
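The character-offset bookkeeping (offset_offset, chunk_len, and the raw_sentence slice) can be illustrated with a toy sentence. The offsets below are made up and stand in for the startOffset/endOffset fields carried by gentle's Word objects:

# Toy illustration of the startOffset/endOffset slicing above.
# raw_sentence and the offsets are made up for this example.
raw_sentence = "the quick brown fox jumps over the lazy dog"

# Pretend the unaligned chunk spans the words "brown fox jumps".
first_word_start_offset = raw_sentence.index("brown")              # 10
last_word_end_offset = raw_sentence.index("jumps") + len("jumps")  # 25

offset_offset = first_word_start_offset
chunk_len = last_word_end_offset - offset_offset
chunk_transcript = raw_sentence[offset_offset:offset_offset + chunk_len]

print(chunk_transcript)  # "brown fox jumps"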
def realign_sub(chunk):
    with wave.open(wavfile, "rb") as wav_obj:
        if chunk["start"] is None:
            start_t = 0
        else:
            start_t = chunk["start"].end

        if chunk["end"] is None:
            end_t = wav_obj.getnframes() / float(wav_obj.getframerate())
        else:
            end_t = chunk["end"].start

    duration = end_t - start_t
    # XXX: the minimum length seems bigger now (?)
    if duration < 0.75 or duration > 60:
        logging.debug(
            "cannot realign %d words with duration %f",
            len(chunk["words"]),
            duration,
        )
        return

    # Create a language model
    offset_offset = chunk["words"][0].startOffset
    chunk_len = chunk["words"][-1].endOffset - offset_offset
    chunk_transcript = ms.raw_sentence[offset_offset:offset_offset +
                                       chunk_len].encode("utf-8")
    chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
    chunk_ks = chunk_ms.get_kaldi_sequence()

    chunk_gen_hclg_filename = language_model.make_bigram_language_model(
        chunk_ks, resources.proto_langdir)
    k = standard_kaldi.Kaldi(resources.nnet_gpu_path,
                             chunk_gen_hclg_filename,
                             resources.proto_langdir)

    with wave.open(wavfile, "rb") as wav_obj:
        wav_obj.setpos(int(start_t * wav_obj.getframerate()))
        buf = wav_obj.readframes(int(duration * wav_obj.getframerate()))

    k.push_chunk(buf)
    ret = [transcription.Word(**wd) for wd in k.get_final()]
    k.stop()

    word_alignment = diff_align.align(ret, chunk_ms)

    for wd in word_alignment:
        wd.shift(time=start_t, offset=offset_offset)

    # "chunk" should be replaced by "words"
    realignments.append({"chunk": chunk, "words": word_alignment})

    if progress_cb is not None:
        progress_cb(
            {"percent": len(realignments) / float(len(to_realign))})
def transcribe(self, wavfile, progress_cb=None):
    wav_obj = wave.open(wavfile, 'r')
    duration = wav_obj.getnframes() / float(wav_obj.getframerate())
    n_chunks = int(
        math.ceil(duration / float(self.chunk_len - self.overlap_t)))

    chunks = []

    def transcribe_chunk(idx):
        wav_obj = wave.open(wavfile, 'r')
        start_t = idx * (self.chunk_len - self.overlap_t)
        # Seek
        wav_obj.setpos(int(start_t * wav_obj.getframerate()))
        # Read frames
        buf = wav_obj.readframes(
            int(self.chunk_len * wav_obj.getframerate()))

        k = self.kaldi_queue.get()
        k.push_chunk(buf)
        ret = k.get_final()
        k.reset()
        self.kaldi_queue.put(k)

        chunks.append({"start": start_t, "words": ret})
        logging.info('%d/%d' % (len(chunks), n_chunks))
        if progress_cb is not None:
            progress_cb({
                "message": ' '.join([X['word'] for X in ret]),
                "percent": len(chunks) / float(n_chunks)
            })

    pool = Pool(min(n_chunks, self.nthreads))
    pool.map(transcribe_chunk, range(n_chunks))
    pool.close()

    chunks.sort(key=lambda x: x['start'])

    # Combine chunks
    words = []
    for c in chunks:
        chunk_start = c['start']
        chunk_end = chunk_start + self.chunk_len
        chunk_words = [
            transcription.Word(**wd).shift(time=chunk_start)
            for wd in c['words']
        ]

        # At chunk boundary cut points the audio often contains part of a
        # word, which can get erroneously identified as one or more
        # different in-vocabulary words.  So discard one or more words
        # near the cut points (they'll be covered by the overlap anyway).
        #
        trim = min(0.25 * self.overlap_t, 0.5)
        if c is not chunks[0]:
            while len(chunk_words) > 1:
                chunk_words.pop(0)
                if chunk_words[0].end > chunk_start + trim:
                    break
        if c is not chunks[-1]:
            while len(chunk_words) > 1:
                chunk_words.pop()
                if chunk_words[-1].start < chunk_end - trim:
                    break

        words.extend(chunk_words)

    # Remove overlap: Sort by time, then filter out any Word entries in
    # the list that are adjacent to another entry corresponding to the same
    # word in the audio.
    words.sort(key=lambda word: word.start)
    words.append(transcription.Word(word="__dummy__"))
    words = [
        words[i] for i in range(len(words) - 1)
        if not words[i].corresponds(words[i + 1])
    ]

    return words
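The boundary-trimming loops are easier to follow with concrete numbers. The sketch below uses a namedtuple as a stand-in for transcription.Word and assumed chunk_len/overlap_t values; it replays the same pop-until-past-the-trim-window logic on a toy chunk that is neither the first nor the last:

# Toy trace of the boundary trimming above; the namedtuple stands in for
# gentle's transcription.Word and the chunk parameters are assumed.
from collections import namedtuple

Word = namedtuple("Word", ["word", "start", "end"])

chunk_len = 20.0
overlap_t = 2.0
chunk_start = 18.0                    # an interior chunk (not first, not last)
chunk_end = chunk_start + chunk_len
trim = min(0.25 * overlap_t, 0.5)     # 0.5 seconds

chunk_words = [
    Word("ly", 18.05, 18.20),         # fragment of a word cut at the left edge
    Word("after", 18.30, 18.70),
    Word("that", 18.75, 19.00),
    # ... middle of the chunk elided ...
    Word("be", 37.60, 37.80),         # fragment near the right cut point
]

# Drop fragments at the left cut point (same loop as in transcribe()).
while len(chunk_words) > 1:
    chunk_words.pop(0)
    if chunk_words[0].end > chunk_start + trim:
        break

# Drop fragments at the right cut point.
while len(chunk_words) > 1:
    chunk_words.pop()
    if chunk_words[-1].start < chunk_end - trim:
        break

print([w.word for w in chunk_words])  # ['after', 'that']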
def realign(chunk):
    nonlocal ignored

    if chunk["start"] is None:
        start_t = 0
    else:
        start_t = chunk["start"].end

    if chunk["end"] is None:
        end_t = final_end_t
    else:
        end_t = chunk["end"].start

    duration = end_t - start_t
    # XXX: the minimum length seems bigger now (?)
    if duration < 0.75 or duration > 60:
        logging.debug("cannot realign %d words with duration %f" % (
            len(chunk["words"]), duration))
        ignored += 1
        return

    # Create a language model
    offset_offset = chunk["words"][0].startOffset
    chunk_len = chunk["words"][-1].endOffset - offset_offset
    chunk_transcript = ms.raw_sentence[offset_offset:offset_offset +
                                       chunk_len].encode("utf-8")
    realign_transcript = chunk_transcript.decode("utf-8").replace(
        "\n", " ")
    logging.debug("realign transcript: %s", realign_transcript)
    chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
    chunk_ks = chunk_ms.get_kaldi_sequence()

    chunk_gen_hclg_filename = language_model.make_bigram_language_model(
        chunk_ks, resources.proto_langdir)

    wav_obj = wave.open(wavfile, "rb")
    wav_obj.setpos(int(start_t * wav_obj.getframerate()))
    buf = wav_obj.readframes(int(duration * wav_obj.getframerate()))
    wav_obj.close()

    retries = 5
    while retries > 0:
        try:
            k = standard_kaldi.Kaldi(
                resources.nnet_gpu_path,
                chunk_gen_hclg_filename,
                resources.proto_langdir,
            )
            k.push_chunk(buf)
            ret = [transcription.Word(**wd) for wd in k.get_final()]
            k.stop()
            break
        except BrokenPipeError:
            retries -= 1
            if retries == 0:
                raise

    word_alignment = diff_align.align(ret, chunk_ms)

    for wd in word_alignment:
        wd.shift(time=start_t, offset=offset_offset)

    # "chunk" should be replaced by "words"
    realignments.append({"chunk": chunk, "words": word_alignment})

    if progress_cb is not None:
        progress_cb({
            "percent":
            (ignored + len(realignments)) / float(len(to_realign))
        })
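For context, each chunk dict consumed by realign() pairs a run of words that failed first-pass alignment with the nearest successfully aligned word on either side ("start"/"end", or None at the transcript edges). The sketch below is a hypothetical illustration of how such a list might be assembled from toy data; the Word namedtuple and its success field are stand-ins, not gentle's classes:

# Hypothetical sketch of building realignment chunks from toy data.
from collections import namedtuple

Word = namedtuple("Word", ["word", "start", "end", "success"])

words = [
    Word("hello", 0.1, 0.4, True),
    Word("there", None, None, False),   # not aligned on the first pass
    Word("world", None, None, False),   # not aligned on the first pass
    Word("again", 1.9, 2.3, True),
]

to_realign = []
run = []
last_aligned = None
for wd in words:
    if wd.success:
        if run:
            to_realign.append({"start": last_aligned, "end": wd, "words": run})
            run = []
        last_aligned = wd
    else:
        run.append(wd)
if run:
    to_realign.append({"start": last_aligned, "end": None, "words": run})

print(len(to_realign))              # 1
print(to_realign[0]["start"].word)  # 'hello'
print(to_realign[0]["end"].word)    # 'again'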