예제 #1
0
 def get_kaldi(self):
     """Return a new ``standard_kaldi.Kaldi`` instance.

     The instance is built from the 'data/nnet_a_gpu_online' resource,
     this object's ``gen_hclg_filename`` and the 'PROTO_LANGDIR' resource.
     A fresh instance is constructed on every call.
     """
     # In theory these instances could be preserved for the duration of
     # a session rather than being rebuilt each time.
     nnet_dir = get_resource('data/nnet_a_gpu_online')
     hclg_file = self.gen_hclg_filename
     lang_dir = get_resource('PROTO_LANGDIR')
     return standard_kaldi.Kaldi(nnet_dir, hclg_file, lang_dir)
예제 #2
0
    def __init__(self, data_dir, nthreads=4, ntranscriptionthreads=2):
        """Store the configuration, load the vocabulary and, when possible,
        prepare a transcriber for full transcription.

        ``self.full_transcriber`` is only set when the
        'data/graph/HCLG.fst' resource exists on disk and
        ``ntranscriptionthreads`` is greater than zero.
        """
        self.data_dir = data_dir
        self.nthreads = nthreads
        self.ntranscriptionthreads = ntranscriptionthreads

        # Vocabulary lives in the word list below the language directory.
        lang_dir = get_resource('PROTO_LANGDIR')
        words_file = os.path.join(lang_dir, "graphdir/words.txt")
        with open(words_file) as vocab_stream:
            self.vocab = metasentence.load_vocabulary(vocab_stream)

        # Kaldi instances for full transcription: one per transcription
        # thread, handed to the transcriber through a queue.
        full_graph = get_resource('data/graph/HCLG.fst')
        graph_available = os.path.exists(full_graph)
        if graph_available and self.ntranscriptionthreads > 0:
            lang_dir = get_resource('PROTO_LANGDIR')
            nnet_dir = get_resource('data/nnet_a_gpu_online')

            instances = Queue()
            for _ in range(self.ntranscriptionthreads):
                instance = standard_kaldi.Kaldi(nnet_dir, full_graph, lang_dir)
                instances.put(instance)
            self.full_transcriber = MultiThreadedTranscriber(
                instances, nthreads=self.ntranscriptionthreads)

        self._status_dicts = {}
예제 #3
0
    def realign(chunk):
        """Re-recognise the audio spanned by one chunk and record the result.

        ``chunk`` is a dict with the keys:
          * "start" / "end": either None or an object exposing ``.end`` /
            ``.start``; None means the beginning / the end of the audio.
          * "words": a non-empty sequence whose items expose
            ``startOffset`` and ``endOffset``.

        The function reads ``wavfile``, ``ms``, ``resources``,
        ``to_realign`` and ``progress_cb`` from the enclosing scope and
        appends ``{"chunk": ..., "words": ...}`` to ``realignments``.
        Chunks shorter than 0.01 or longer than 60 (in the units of
        frames / framerate) are skipped. Always returns None.
        """
        # The first pass over the file only needs its total length; close
        # the handle as soon as the bounds are known instead of leaking it.
        wav_obj = wave.open(wavfile, 'r')
        try:
            if chunk["start"] is None:
                start_t = 0
            else:
                start_t = chunk["start"].end

            if chunk["end"] is None:
                end_t = wav_obj.getnframes() / float(wav_obj.getframerate())
            else:
                end_t = chunk["end"].start
        finally:
            wav_obj.close()

        duration = end_t - start_t
        if duration < 0.01 or duration > 60:
            logging.debug("cannot realign %d words with duration %f" %
                          (len(chunk['words']), duration))
            return

        # Create a language model restricted to this chunk's transcript
        offset_offset = chunk['words'][0].startOffset
        chunk_len = chunk['words'][-1].endOffset - offset_offset
        chunk_transcript = ms.raw_sentence[offset_offset:offset_offset +
                                           chunk_len].encode("utf-8")
        chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
        chunk_ks = chunk_ms.get_kaldi_sequence()

        chunk_gen_hclg_filename = language_model.make_bigram_language_model(
            chunk_ks, resources.proto_langdir)
        k = standard_kaldi.Kaldi(resources.nnet_gpu_path,
                                 chunk_gen_hclg_filename,
                                 resources.proto_langdir)

        # Read just the frames covered by the chunk, then release the file
        # even if seeking or reading fails.
        wav_obj = wave.open(wavfile, 'r')
        try:
            framerate = wav_obj.getframerate()
            wav_obj.setpos(int(start_t * framerate))
            buf = wav_obj.readframes(int(duration * framerate))
        finally:
            wav_obj.close()

        k.push_chunk(buf)
        ret = [transcription.Word(**wd) for wd in k.get_final()]
        k.stop()

        word_alignment = diff_align.align(ret, chunk_ms)

        # Adjust startOffset, endOffset, and timing to match originals:
        # the recogniser saw only the chunk, so its values are relative.
        for wd in word_alignment:
            if wd.end is not None:
                # Apply timing offset
                wd.start += start_t
                wd.end += start_t

            if wd.endOffset is not None:
                wd.startOffset += offset_offset
                wd.endOffset += offset_offset

        # "chunk" should be replaced by "words"
        realignments.append({"chunk": chunk, "words": word_alignment})

        if progress_cb is not None:
            progress_cb(
                {"percent": len(realignments) / float(len(to_realign))})
예제 #4
0
def build(resources, nthreads=4, hclg_path=None):
    """Return a queue holding ``nthreads`` new Kaldi instances.

    Every instance is created from ``resources.nnet_gpu_path``, the HCLG
    path and ``resources.proto_langdir``. When ``hclg_path`` is None,
    ``resources.full_hclg_path`` is used instead.
    """
    graph_path = resources.full_hclg_path if hclg_path is None else hclg_path

    pool = Queue()
    for _ in range(nthreads):
        instance = standard_kaldi.Kaldi(resources.nnet_gpu_path, graph_path,
                                        resources.proto_langdir)
        pool.put(instance)
    return pool
예제 #5
0
파일: multipass.py 프로젝트: veravox/gentle
    def realign_sub(chunk):
        """Re-recognise the audio spanned by one chunk and record the result.

        ``chunk`` is a dict with "start" / "end" (None, or an object
        exposing ``.end`` / ``.start``) and "words" (a non-empty sequence
        whose items expose ``startOffset`` / ``endOffset``).

        Uses ``wavfile``, ``ms``, ``resources``, ``to_realign`` and
        ``progress_cb`` from the enclosing scope and appends one entry to
        ``realignments``. Chunks with a duration below 0.75 or above 60
        are only logged. Always returns None.
        """
        # Work out the time window; a missing neighbour means the window
        # runs from the start or to the very end of the audio.
        with wave.open(wavfile, "rb") as audio:
            start_t = 0 if chunk["start"] is None else chunk["start"].end
            if chunk["end"] is None:
                end_t = audio.getnframes() / float(audio.getframerate())
            else:
                end_t = chunk["end"].start

        duration = end_t - start_t
        # XXX: the minimum length seems bigger now (?)
        too_short = duration < 0.75
        too_long = duration > 60
        if too_short or too_long:
            logging.debug(
                "cannot realign %d words with duration %f",
                len(chunk["words"]),
                duration,
            )
            return

        # Build a language model from just this chunk's transcript text.
        chunk_words = chunk["words"]
        offset_offset = chunk_words[0].startOffset
        span = chunk_words[-1].endOffset - offset_offset
        raw_text = ms.raw_sentence[offset_offset:offset_offset + span]
        chunk_ms = metasentence.MetaSentence(raw_text.encode("utf-8"),
                                             resources.vocab)
        hclg_file = language_model.make_bigram_language_model(
            chunk_ms.get_kaldi_sequence(), resources.proto_langdir)
        recogniser = standard_kaldi.Kaldi(resources.nnet_gpu_path,
                                          hclg_file,
                                          resources.proto_langdir)

        # Pull only the frames inside the window.
        with wave.open(wavfile, "rb") as audio:
            first_frame = int(start_t * audio.getframerate())
            audio.setpos(first_frame)
            frame_count = int(duration * audio.getframerate())
            buf = audio.readframes(frame_count)

        recogniser.push_chunk(buf)
        hypothesis = [
            transcription.Word(**fields) for fields in recogniser.get_final()
        ]
        recogniser.stop()

        word_alignment = diff_align.align(hypothesis, chunk_ms)

        # Results are relative to the chunk; move them back into the
        # coordinates of the whole recording / transcript.
        for aligned in word_alignment:
            aligned.shift(time=start_t, offset=offset_offset)

        # "chunk" should be replaced by "words"
        entry = {"chunk": chunk, "words": word_alignment}
        realignments.append(entry)

        if progress_cb is not None:
            fraction = len(realignments) / float(len(to_realign))
            progress_cb({"percent": fraction})
예제 #6
0
        words.append(transcription.Word(word="__dummy__"))
        words = [
            words[i] for i in range(len(words) - 1)
            if not words[i].corresponds(words[i + 1])
        ]

        return words


if __name__ == '__main__':
    # Full transcription from the command line:
    #   argv[1] is the input media file, argv[2] the JSON output path.
    from Queue import Queue
    from util import ffmpeg
    from gentle import standard_kaldi

    import sys

    import logging
    logging.getLogger().setLevel('INFO')

    # One Kaldi instance per worker; the transcriber takes them from a queue.
    k_queue = Queue()
    for _ in range(3):
        k_queue.put(standard_kaldi.Kaldi())

    trans = MultiThreadedTranscriber(k_queue)

    with gentle.resampled(sys.argv[1]) as filename:
        out = trans.transcribe(filename)

    # Use a context manager so the result is flushed and the file closed
    # deterministically rather than whenever the handle is collected.
    with open(sys.argv[2], 'w') as out_file:
        out_file.write(out.to_json())
예제 #7
0

if __name__ == '__main__':
    # Full transcription from the command line:
    #   argv[1] is the input media file, argv[2] the JSON output path.
    from Queue import Queue
    import json
    import sys

    import logging
    logging.getLogger().setLevel('INFO')

    import gentle
    from gentle import standard_kaldi

    resources = gentle.Resources()

    # One Kaldi instance per worker; the transcriber takes them from a queue.
    k_queue = Queue()
    for _ in range(3):
        k_queue.put(
            standard_kaldi.Kaldi(resources.nnet_gpu_path,
                                 resources.full_hclg_path,
                                 resources.proto_langdir))

    trans = MultiThreadedTranscriber(k_queue)

    with gentle.resampled(sys.argv[1]) as filename:
        out = trans.transcribe(filename)

    # Use a context manager so the result is flushed and the file closed
    # deterministically rather than whenever the handle is collected.
    with open(sys.argv[2], 'w') as out_file:
        out_file.write(transcription.Transcription(words=out).to_json())
예제 #8
0
    def realign(chunk):
        nonlocal ignored

        if chunk["start"] is None:
            start_t = 0
        else:
            start_t = chunk["start"].end

        if chunk["end"] is None:
            end_t = final_end_t
        else:
            end_t = chunk["end"].start

        duration = end_t - start_t
        # XXX: the minimum length seems bigger now (?)
        if duration < 0.75 or duration > 60:
            logging.debug("cannot realign %d words with duration %f" %
                          (len(chunk["words"]), duration))
            ignored += 1
            return

        # Create a language model
        offset_offset = chunk["words"][0].startOffset
        chunk_len = chunk["words"][-1].endOffset - offset_offset
        chunk_transcript = ms.raw_sentence[offset_offset:offset_offset +
                                           chunk_len].encode("utf-8")
        realign_transcript = chunk_transcript.decode("utf-8").replace(
            "\n", " ")
        logging.debug("realign transcript: %s", realign_transcript)
        chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
        chunk_ks = chunk_ms.get_kaldi_sequence()

        chunk_gen_hclg_filename = language_model.make_bigram_language_model(
            chunk_ks, resources.proto_langdir)
        wav_obj = wave.open(wavfile, "rb")
        wav_obj.setpos(int(start_t * wav_obj.getframerate()))
        buf = wav_obj.readframes(int(duration * wav_obj.getframerate()))
        wav_obj.close()

        retries = 5
        while retries > 0:
            try:
                k = standard_kaldi.Kaldi(
                    resources.nnet_gpu_path,
                    chunk_gen_hclg_filename,
                    resources.proto_langdir,
                )
                k.push_chunk(buf)
                ret = [transcription.Word(**wd) for wd in k.get_final()]
                k.stop()
                break
            except BrokenPipeError:
                retries -= 1
                if retries == 0:
                    raise

        word_alignment = diff_align.align(ret, chunk_ms)

        for wd in word_alignment:
            wd.shift(time=start_t, offset=offset_offset)

        # "chunk" should be replaced by "words"
        realignments.append({"chunk": chunk, "words": word_alignment})

        if progress_cb is not None:
            progress_cb({
                "percent":
                (ignored + len(realignments)) / float(len(to_realign))
            })
예제 #9
0
    def transcribe(self, uid, transcript, audio, async):

        proto_langdir = get_resource('PROTO_LANGDIR')
        
        status = self.get_status(uid)

        status['status'] = 'STARTED'
        output = {
            'transcript': transcript
        }

        outdir = os.path.join(self.data_dir, 'transcriptions', uid)                

        tran_path = os.path.join(outdir, 'transcript.txt')
        with codecs.open(tran_path, 'w', 'utf-8') as tranfile:
            tranfile.write(transcript)
        audio_path = os.path.join(outdir, 'upload')
        with open(audio_path, 'w') as wavfile:
            wavfile.write(audio)

        status['status'] = 'ENCODING'

        wavfile = os.path.join(outdir, 'a.wav')
        if to_wav(os.path.join(outdir, 'upload'), wavfile) != 0:
            status['status'] = 'ERROR'
            status['error'] = "Encoding failed. Make sure that you've uploaded a valid media file."
            # Save the status so that errors are recovered on restart of the server
            # XXX: This won't work, because the endpoint will override this file
            with open(os.path.join(outdir, 'status.json'), 'w') as jsfile:
                json.dump(status, jsfile, indent=2)
            return

        #XXX: Maybe we should pass this wave object instead of the
        # file path to align_progress
        wav_obj = wave.open(wavfile, 'r')
        status['duration'] = wav_obj.getnframes() / float(wav_obj.getframerate())
        status['status'] = 'TRANSCRIBING'

        def on_progress(p):
            for k,v in p.items():
                status[k] = v

        if len(transcript.strip()) > 0:
            ms = metasentence.MetaSentence(transcript, self.vocab)
            ks = ms.get_kaldi_sequence()
            gen_hclg_filename = language_model.make_bigram_language_model(ks, proto_langdir)

            kaldi_queue = Queue()
            for i in range(self.nthreads):
                kaldi_queue.put(standard_kaldi.Kaldi(
                    get_resource('data/nnet_a_gpu_online'),
                    gen_hclg_filename,
                    proto_langdir)
                )

            mtt = MultiThreadedTranscriber(kaldi_queue, nthreads=self.nthreads)
        elif hasattr(self, 'full_transcriber'):
            mtt = self.full_transcriber
        else:
            status['status'] = 'ERROR'
            status['error']  = 'No transcript provided and no language model for full transcription'
            return

        words = mtt.transcribe(wavfile, progress_cb=on_progress)

        output = {}
        if len(transcript.strip()) > 0:
            # Clear queue (would this be gc'ed?)
            for i in range(self.nthreads):
                k = kaldi_queue.get()
                k.stop()

            # Align words
            output['words'] = diff_align.align(words, ms)
            output['transcript'] = transcript

            # Perform a second-pass with unaligned words
            logging.info("%d unaligned words (of %d)" % (len([X for X in output['words'] if X.get("case") == "not-found-in-audio"]), len(output['words'])))

            status['status'] = 'ALIGNING'

            output['words'] = multipass.realign(wavfile, output['words'], ms, nthreads=self.nthreads, progress_cb=on_progress)

            logging.info("after 2nd pass: %d unaligned words (of %d)" % (len([X for X in output['words'] if X.get("case") == "not-found-in-audio"]), len(output['words'])))
            
        else:
            # Match format
            output = make_transcription_alignment({"words": words})

        # ...remove the original upload
        os.unlink(os.path.join(outdir, 'upload'))

        # Save
        with open(os.path.join(outdir, 'align.json'), 'w') as jsfile:
            json.dump(output, jsfile, indent=2)
        with open(os.path.join(outdir, 'align.csv'), 'w') as csvfile:
            csvfile.write(to_csv(output))

        # Inline the alignment into the index.html file.
        htmltxt = open(get_resource('www/view_alignment.html')).read()
        htmltxt = htmltxt.replace("var INLINE_JSON;", "var INLINE_JSON=%s;" % (json.dumps(output)));
        open(os.path.join(outdir, 'index.html'), 'w').write(htmltxt)

        status['status'] = 'OK'

        logging.info('done with transcription.')

        return output