Example #1
    def realign(chunk):
        wav_obj = wave.open(wavfile, 'r')

        if chunk["start"] is None:
            start_t = 0
        else:
            start_t = chunk["start"].end

        if chunk["end"] is None:
            end_t = wav_obj.getnframes() / float(wav_obj.getframerate())
        else:
            end_t = chunk["end"].start

        duration = end_t - start_t
        if duration < 0.01 or duration > 60:
            logging.debug("cannot realign %d words with duration %f" %
                          (len(chunk['words']), duration))
            return

        # Create a language model
        offset_offset = chunk['words'][0].startOffset
        chunk_len = chunk['words'][-1].endOffset - offset_offset
        chunk_transcript = ms.raw_sentence[offset_offset:offset_offset +
                                           chunk_len].encode("utf-8")
        chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
        chunk_ks = chunk_ms.get_kaldi_sequence()

        chunk_gen_hclg_filename = language_model.make_bigram_language_model(
            chunk_ks, resources.proto_langdir)
        k = standard_kaldi.Kaldi(resources.nnet_gpu_path,
                                 chunk_gen_hclg_filename,
                                 resources.proto_langdir)

        # Reuse the wave object opened above: seek to the chunk start,
        # read `duration` seconds of audio, then release the file handle
        wav_obj.setpos(int(start_t * wav_obj.getframerate()))
        buf = wav_obj.readframes(int(duration * wav_obj.getframerate()))
        wav_obj.close()

        k.push_chunk(buf)
        ret = [transcription.Word(**wd) for wd in k.get_final()]
        k.stop()

        word_alignment = diff_align.align(ret, chunk_ms)

        # Adjust startOffset, endOffset, and timing to match originals
        for wd in word_alignment:
            if wd.end is not None:
                # Apply timing offset
                wd.start += start_t
                wd.end += start_t

            if wd.endOffset is not None:
                wd.startOffset += offset_offset
                wd.endOffset += offset_offset

        # "chunk" should be replaced by "words"
        realignments.append({"chunk": chunk, "words": word_alignment})

        if progress_cb is not None:
            progress_cb(
                {"percent": len(realignments) / float(len(to_realign))})
Example #2
File: serve.py Project: afcarl/earmark
    def onchange(self, sender, change_doc):
        update = False
        if change_doc.get("doc", {}).get("type") == "command":
            # Save kaldi-sequence from the text
            seq = metasentence.MetaSentence(change_doc["doc"].get("text", ""),
                                            vocab).get_kaldi_sequence()
            change_doc["doc"]["_ks"] = seq
            self._command_seqs[change_doc["id"]] = seq
            # Set "sender" to None so that all peers get a change update
            sender = None
            update = True
        elif change_doc["type"] == 'delete' and change_doc[
                "id"] in self._command_seqs:
            del self._command_seqs[change_doc["id"]]
            update = True
        elif change_doc.get("doc", {}).get("type") == "audio-command":
            print 'got new audio command', change_doc['doc']
            self._pending_audio_commands.append(change_doc["doc"])

            self.subdir_resources['factory'].check_pending_audio_commands()

        minidb.DBFactory.onchange(self, sender, change_doc)

        if update:
            self.create_language_model()
            reactor.callInThread(
                self.subdir_resources['factory'].re_run_everything)
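
For context, MetaSentence normalizes raw text against the decoder vocabulary, and get_kaldi_sequence() returns the resulting token list. Illustrative only; the exact output depends on the vocab and on gentle's normalization rules:

ms = metasentence.MetaSentence("Hello, world!", vocab)
print(ms.get_kaldi_sequence())  # e.g. ['hello', 'world']
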
Example #3
    def __init__(self, resources, transcript, nthreads=4, **kwargs):
        self.kwargs = kwargs
        self.nthreads = nthreads
        self.transcript = transcript
        self.resources = resources
        self.ms = metasentence.MetaSentence(transcript, resources.vocab)
        ks = self.ms.get_kaldi_sequence()
        gen_hclg_filename = language_model.make_bigram_language_model(
            ks, resources.proto_langdir, **kwargs)
        self.queue = kaldi_queue.build(
            resources, hclg_path=gen_hclg_filename, nthreads=nthreads)
        self.mtt = MultiThreadedTranscriber(self.queue, nthreads=nthreads)
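
This constructor builds the bigram HCLG graph and a pool of Kaldi decoders once per transcript. Assuming the enclosing class is gentle's ForcedAligner (the signature matches) and that its transcribe() delegates to self.mtt, typical usage might look like:

import logging

import gentle

resources = gentle.Resources()
with open('transcript.txt') as txt_file:
    transcript = txt_file.read()

aligner = gentle.ForcedAligner(resources, transcript, nthreads=4)
result = aligner.transcribe('a.wav', progress_cb=lambda p: logging.info(p))
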
Example #4
    def realign_sub(chunk):
        with wave.open(wavfile, "rb") as wav_obj:
            if chunk["start"] is None:
                start_t = 0
            else:
                start_t = chunk["start"].end

            if chunk["end"] is None:
                end_t = wav_obj.getnframes() / float(wav_obj.getframerate())
            else:
                end_t = chunk["end"].start

        duration = end_t - start_t
        # XXX: the minimum length seems bigger now (?)
        if duration < 0.75 or duration > 60:
            logging.debug(
                "cannot realign %d words with duration %f",
                len(chunk["words"]),
                duration,
            )
            return

        # Create a language model
        offset_offset = chunk["words"][0].startOffset
        chunk_len = chunk["words"][-1].endOffset - offset_offset
        chunk_transcript = ms.raw_sentence[offset_offset:offset_offset +
                                           chunk_len].encode("utf-8")
        chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
        chunk_ks = chunk_ms.get_kaldi_sequence()

        chunk_gen_hclg_filename = language_model.make_bigram_language_model(
            chunk_ks, resources.proto_langdir)
        k = standard_kaldi.Kaldi(resources.nnet_gpu_path,
                                 chunk_gen_hclg_filename,
                                 resources.proto_langdir)

        with wave.open(wavfile, "rb") as wav_obj:
            wav_obj.setpos(int(start_t * wav_obj.getframerate()))
            buf = wav_obj.readframes(int(duration * wav_obj.getframerate()))

        k.push_chunk(buf)
        ret = [transcription.Word(**wd) for wd in k.get_final()]
        k.stop()

        word_alignment = diff_align.align(ret, chunk_ms)

        for wd in word_alignment:
            wd.shift(time=start_t, offset=offset_offset)

        # "chunk" should be replaced by "words"
        realignments.append({"chunk": chunk, "words": word_alignment})

        if progress_cb is not None:
            progress_cb(
                {"percent": len(realignments) / float(len(to_realign))})
Example #5
File: diff_align.py Project: wan/gentle
    for op, s1, e1, s2, e2 in opcodes:
        if op == 'delete':
            for i in range(s1, e1):
                yield (op, i, i+1, s2, s2)
        elif op == 'insert':
            for i in range(s2, e2):
                yield (op, s1, s1, i, i+1)
        else:
            len1 = e1-s1
            len2 = e2-s2
            for i1, i2 in zip(range(s1, e1), range(s2, e2)):
                yield (op, i1, i1 + 1, i2, i2 + 1)
            if len1 > len2:
                for i in range(s1 + len2, e1):
                    yield ('delete', i, i+1, e2, e2)
            if len2 > len1:
                for i in range(s2 + len1, e2):
                    yield ('insert', s1, s1, i, i+1)
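
This loop expands coarse difflib-style opcodes into one operation per word: an 'insert' or 'delete' hunk is emitted word by word, and a 'replace' hunk of unequal lengths becomes pairwise replacements plus trailing deletes or inserts for the remainder. A small self-contained demonstration (the loop is reproduced under the hypothetical name expand_opcodes so the demo runs standalone):

import difflib


def expand_opcodes(opcodes):
    # Same per-word expansion as the loop above
    for op, s1, e1, s2, e2 in opcodes:
        if op == 'delete':
            for i in range(s1, e1):
                yield (op, i, i + 1, s2, s2)
        elif op == 'insert':
            for i in range(s2, e2):
                yield (op, s1, s1, i, i + 1)
        else:
            len1, len2 = e1 - s1, e2 - s2
            for i1, i2 in zip(range(s1, e1), range(s2, e2)):
                yield (op, i1, i1 + 1, i2, i2 + 1)
            if len1 > len2:
                for i in range(s1 + len2, e1):
                    yield ('delete', i, i + 1, e2, e2)
            if len2 > len1:
                for i in range(s2 + len1, e2):
                    yield ('insert', s1, s1, i, i + 1)


hypothesis = ['the', 'quick', 'brown', 'fox']
reference = ['the', 'quick', 'red', 'fox', 'jumps']
opcodes = difflib.SequenceMatcher(a=hypothesis, b=reference).get_opcodes()
# -> [('equal', 0, 2, 0, 2), ('replace', 2, 3, 2, 3),
#     ('equal', 3, 4, 3, 4), ('insert', 4, 4, 4, 5)]

for hunk in expand_opcodes(opcodes):
    print(hunk)
# ('equal', 0, 1, 0, 1)
# ('equal', 1, 2, 1, 2)
# ('replace', 2, 3, 2, 3)
# ('equal', 3, 4, 3, 4)
# ('insert', 4, 4, 4, 5)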

if __name__ == '__main__':
    TEXT_FILE = sys.argv[1]
    JSON_FILE = sys.argv[2]
    OUTPUT_FILE = sys.argv[3]

    with open(TEXT_FILE) as txt_file:
        ms = metasentence.MetaSentence(txt_file.read(), Resources().vocab)
    with open(JSON_FILE) as json_file:
        alignment = json.load(json_file)['words']

    out = align(alignment, ms)

    with open(OUTPUT_FILE, 'w') as out_file:
        json.dump(out, out_file, indent=2)
Example #6
    def realign(chunk):
        nonlocal ignored

        if chunk["start"] is None:
            start_t = 0
        else:
            start_t = chunk["start"].end

        if chunk["end"] is None:
            end_t = final_end_t
        else:
            end_t = chunk["end"].start

        duration = end_t - start_t
        # XXX: the minimum length seems bigger now (?)
        if duration < 0.75 or duration > 60:
            logging.debug("cannot realign %d words with duration %f" %
                          (len(chunk["words"]), duration))
            ignored += 1
            return

        # Create a language model
        offset_offset = chunk["words"][0].startOffset
        chunk_len = chunk["words"][-1].endOffset - offset_offset
        chunk_transcript = ms.raw_sentence[offset_offset:offset_offset +
                                           chunk_len].encode("utf-8")
        realign_transcript = chunk_transcript.decode("utf-8").replace(
            "\n", " ")
        logging.debug("realign transcript: %s", realign_transcript)
        chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
        chunk_ks = chunk_ms.get_kaldi_sequence()

        chunk_gen_hclg_filename = language_model.make_bigram_language_model(
            chunk_ks, resources.proto_langdir)
        wav_obj = wave.open(wavfile, "rb")
        wav_obj.setpos(int(start_t * wav_obj.getframerate()))
        buf = wav_obj.readframes(int(duration * wav_obj.getframerate()))
        wav_obj.close()

        retries = 5
        while retries > 0:
            try:
                k = standard_kaldi.Kaldi(
                    resources.nnet_gpu_path,
                    chunk_gen_hclg_filename,
                    resources.proto_langdir,
                )
                k.push_chunk(buf)
                ret = [transcription.Word(**wd) for wd in k.get_final()]
                k.stop()
                break
            except BrokenPipeError:
                retries -= 1
                if retries == 0:
                    raise

        word_alignment = diff_align.align(ret, chunk_ms)

        for wd in word_alignment:
            wd.shift(time=start_t, offset=offset_offset)

        # "chunk" should be replaced by "words"
        realignments.append({"chunk": chunk, "words": word_alignment})

        if progress_cb is not None:
            progress_cb({
                "percent":
                (ignored + len(realignments)) / float(len(to_realign))
            })
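
The retry loop above guards against the Kaldi subprocess dying mid-decode: standard_kaldi drives the decoder over a pipe, so a crash surfaces as BrokenPipeError, and the remedy is simply to start a fresh process. The same pattern as a reusable helper (decode_with_retries is hypothetical; it assumes the snippet's transcription module is in scope):

def decode_with_retries(make_kaldi, buf, retries=5):
    """Retry a pipe-based decode with a fresh Kaldi process on failure."""
    for attempt in range(retries):
        k = make_kaldi()
        try:
            k.push_chunk(buf)
            return [transcription.Word(**wd) for wd in k.get_final()]
        except BrokenPipeError:
            if attempt == retries - 1:
                raise
        finally:
            try:
                k.stop()  # best effort; the pipe may already be gone
            except OSError:
                pass
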
Example #7
    def transcribe(self, uid, transcript, audio, async):
        # Note: Python 2 source; `async` became a reserved word in Python 3.7.

        proto_langdir = get_resource('PROTO_LANGDIR')
        
        status = self.get_status(uid)

        status['status'] = 'STARTED'
        output = {
            'transcript': transcript
        }

        outdir = os.path.join(self.data_dir, 'transcriptions', uid)                

        tran_path = os.path.join(outdir, 'transcript.txt')
        with codecs.open(tran_path, 'w', 'utf-8') as tranfile:
            tranfile.write(transcript)
        audio_path = os.path.join(outdir, 'upload')
        # Write the raw upload as binary so it round-trips on all platforms
        with open(audio_path, 'wb') as audiofile:
            audiofile.write(audio)

        status['status'] = 'ENCODING'

        wavfile = os.path.join(outdir, 'a.wav')
        if to_wav(os.path.join(outdir, 'upload'), wavfile) != 0:
            status['status'] = 'ERROR'
            status['error'] = "Encoding failed. Make sure that you've uploaded a valid media file."
            # Save the status so that errors are recovered on restart of the server
            # XXX: This won't work, because the endpoint will override this file
            with open(os.path.join(outdir, 'status.json'), 'w') as jsfile:
                json.dump(status, jsfile, indent=2)
            return

        # XXX: Maybe we should pass this wave object instead of the
        # file path to align_progress
        wav_obj = wave.open(wavfile, 'r')
        status['duration'] = wav_obj.getnframes() / float(wav_obj.getframerate())
        wav_obj.close()
        status['status'] = 'TRANSCRIBING'

        def on_progress(p):
            for k, v in p.items():
                status[k] = v

        if len(transcript.strip()) > 0:
            ms = metasentence.MetaSentence(transcript, self.vocab)
            ks = ms.get_kaldi_sequence()
            gen_hclg_filename = language_model.make_bigram_language_model(ks, proto_langdir)

            kaldi_queue = Queue()
            for i in range(self.nthreads):
                kaldi_queue.put(standard_kaldi.Kaldi(
                    get_resource('data/nnet_a_gpu_online'),
                    gen_hclg_filename,
                    proto_langdir)
                )

            mtt = MultiThreadedTranscriber(kaldi_queue, nthreads=self.nthreads)
        elif hasattr(self, 'full_transcriber'):
            mtt = self.full_transcriber
        else:
            status['status'] = 'ERROR'
            status['error']  = 'No transcript provided and no language model for full transcription'
            return

        words = mtt.transcribe(wavfile, progress_cb=on_progress)

        output = {}
        if len(transcript.strip()) > 0:
            # Clear queue (would this be gc'ed?)
            for i in range(self.nthreads):
                k = kaldi_queue.get()
                k.stop()

            # Align words
            output['words'] = diff_align.align(words, ms)
            output['transcript'] = transcript

            # Perform a second-pass with unaligned words
            logging.info("%d unaligned words (of %d)" % (len([X for X in output['words'] if X.get("case") == "not-found-in-audio"]), len(output['words'])))

            status['status'] = 'ALIGNING'

            output['words'] = multipass.realign(
                wavfile, output['words'], ms,
                nthreads=self.nthreads, progress_cb=on_progress)

            logging.info("after 2nd pass: %d unaligned words (of %d)" % (len([X for X in output['words'] if X.get("case") == "not-found-in-audio"]), len(output['words'])))
            
        else:
            # Match format
            output = make_transcription_alignment({"words": words})

        # ...remove the original upload
        os.unlink(os.path.join(outdir, 'upload'))

        # Save
        with open(os.path.join(outdir, 'align.json'), 'w') as jsfile:
            json.dump(output, jsfile, indent=2)
        with open(os.path.join(outdir, 'align.csv'), 'w') as csvfile:
            csvfile.write(to_csv(output))

        # Inline the alignment into the index.html file.
        with open(get_resource('www/view_alignment.html')) as htmlfile:
            htmltxt = htmlfile.read()
        htmltxt = htmltxt.replace("var INLINE_JSON;",
                                  "var INLINE_JSON=%s;" % json.dumps(output))
        with open(os.path.join(outdir, 'index.html'), 'w') as htmlfile:
            htmlfile.write(htmltxt)

        status['status'] = 'OK'

        logging.info('done with transcription.')

        return output