def test_to_csv():
    """Check the CSV rendering produced by ``to_csv`` for a table of cases.

    Each case is ``(alignment, expected_csv, case_name)``. The expectations
    encode that words whose case is ``not-found-in-transcript`` are left out
    of the CSV, while ``success`` and ``not-found-in-audio`` words are
    emitted as ``word,alignedWord,start,end`` rows terminated by CRLF.
    """
    cases = [
        ({"words": []}, '', 'empty'),
        (
            {
                "words": [
                    {"case": "success", "word": "A", "alignedWord": "a", "start": 0, "end": 1},
                ]
            },
            'A,a,0,1\r\n',
            'single word',
        ),
        (
            {
                "words": [
                    {"case": "success", "word": "A", "alignedWord": "a", "start": 0, "end": 1},
                    {"case": "not-found-in-audio", "word": "B", "alignedWord": "b", "start": 2, "end": 3},
                ]
            },
            'A,a,0,1\r\nB,b,2,3\r\n',
            'multi-word',
        ),
        (
            {
                "words": [
                    {"case": "not-found-in-audio", "word": "A", "alignedWord": "a", "start": 0, "end": 1},
                ]
            },
            'A,a,0,1\r\n',
            'not found in audio',
        ),
        (
            {
                "words": [
                    {"case": "not-found-in-transcript", "word": "A", "alignedWord": "a", "start": 0, "end": 1},
                ]
            },
            '',
            'not found in transcript',
        ),
    ]

    # `alignment` (rather than `input`) avoids shadowing the builtin, and the
    # case name goes into the failure message so a broken case is identifiable.
    for alignment, want, name in cases:
        got = to_csv(alignment)
        assert_equals(want, got, '%s: expected %r, got %r' % (name, want, got))
def save():
    """Persist the alignment result as JSON and CSV.

    Writes ``output`` as 2-space-indented JSON to ``align.json`` and the
    string returned by ``to_csv(output)`` to ``align.csv``. Both files are
    created inside ``outdir``. ``outdir`` and ``output`` are not parameters;
    they are looked up in the surrounding scope when the function runs.
    """
    json_path = os.path.join(outdir, 'align.json')
    with open(json_path, 'w') as json_out:
        json.dump(output, json_out, indent=2)

    csv_path = os.path.join(outdir, 'align.csv')
    with open(csv_path, 'w') as csv_out:
        # The CSV text is produced only after the file has been opened,
        # matching the original ordering of side effects.
        csv_out.write(to_csv(output))
def transcribe(self, uid, transcript, audio, async): proto_langdir = get_resource('PROTO_LANGDIR') status = self.get_status(uid) status['status'] = 'STARTED' output = { 'transcript': transcript } outdir = os.path.join(self.data_dir, 'transcriptions', uid) tran_path = os.path.join(outdir, 'transcript.txt') with codecs.open(tran_path, 'w', 'utf-8') as tranfile: tranfile.write(transcript) audio_path = os.path.join(outdir, 'upload') with open(audio_path, 'w') as wavfile: wavfile.write(audio) status['status'] = 'ENCODING' wavfile = os.path.join(outdir, 'a.wav') if to_wav(os.path.join(outdir, 'upload'), wavfile) != 0: status['status'] = 'ERROR' status['error'] = "Encoding failed. Make sure that you've uploaded a valid media file." # Save the status so that errors are recovered on restart of the server # XXX: This won't work, because the endpoint will override this file with open(os.path.join(outdir, 'status.json'), 'w') as jsfile: json.dump(status, jsfile, indent=2) return #XXX: Maybe we should pass this wave object instead of the # file path to align_progress wav_obj = wave.open(wavfile, 'r') status['duration'] = wav_obj.getnframes() / float(wav_obj.getframerate()) status['status'] = 'TRANSCRIBING' def on_progress(p): for k,v in p.items(): status[k] = v if len(transcript.strip()) > 0: ms = metasentence.MetaSentence(transcript, self.vocab) ks = ms.get_kaldi_sequence() gen_hclg_filename = language_model.make_bigram_language_model(ks, proto_langdir) kaldi_queue = Queue() for i in range(self.nthreads): kaldi_queue.put(standard_kaldi.Kaldi( get_resource('data/nnet_a_gpu_online'), gen_hclg_filename, proto_langdir) ) mtt = MultiThreadedTranscriber(kaldi_queue, nthreads=self.nthreads) elif hasattr(self, 'full_transcriber'): mtt = self.full_transcriber else: status['status'] = 'ERROR' status['error'] = 'No transcript provided and no language model for full transcription' return words = mtt.transcribe(wavfile, progress_cb=on_progress) output = {} if len(transcript.strip()) > 0: # 
Clear queue (would this be gc'ed?) for i in range(self.nthreads): k = kaldi_queue.get() k.stop() # Align words output['words'] = diff_align.align(words, ms) output['transcript'] = transcript # Perform a second-pass with unaligned words logging.info("%d unaligned words (of %d)" % (len([X for X in output['words'] if X.get("case") == "not-found-in-audio"]), len(output['words']))) status['status'] = 'ALIGNING' output['words'] = multipass.realign(wavfile, output['words'], ms, nthreads=self.nthreads, progress_cb=on_progress) logging.info("after 2nd pass: %d unaligned words (of %d)" % (len([X for X in output['words'] if X.get("case") == "not-found-in-audio"]), len(output['words']))) else: # Match format output = make_transcription_alignment({"words": words}) # ...remove the original upload os.unlink(os.path.join(outdir, 'upload')) # Save with open(os.path.join(outdir, 'align.json'), 'w') as jsfile: json.dump(output, jsfile, indent=2) with open(os.path.join(outdir, 'align.csv'), 'w') as csvfile: csvfile.write(to_csv(output)) # Inline the alignment into the index.html file. htmltxt = open(get_resource('www/view_alignment.html')).read() htmltxt = htmltxt.replace("var INLINE_JSON;", "var INLINE_JSON=%s;" % (json.dumps(output))); open(os.path.join(outdir, 'index.html'), 'w').write(htmltxt) status['status'] = 'OK' logging.info('done with transcription.') return output
def test_to_csv():
    """Table-driven check of ``to_csv`` against known alignments.

    Every entry is ``[alignment, expected_csv, case_name]``. Per the
    expectations below, ``success`` and ``not-found-in-audio`` words produce
    a ``word,alignedWord,start,end`` row ending in CRLF, and
    ``not-found-in-transcript`` words produce no row at all.
    """
    tests = [
        [{
            "words": []
        }, '', 'empty'],
        [
            {
                "words": [
                    {
                        "case": "success",
                        "word": "A",
                        "alignedWord": "a",
                        "start": 0,
                        "end": 1
                    },
                ]
            },
            'A,a,0,1\r\n',
            'single word',
        ],
        [
            {
                "words": [
                    {
                        "case": "success",
                        "word": "A",
                        "alignedWord": "a",
                        "start": 0,
                        "end": 1
                    },
                    {
                        "case": "not-found-in-audio",
                        "word": "B",
                        "alignedWord": "b",
                        "start": 2,
                        "end": 3
                    },
                ]
            },
            'A,a,0,1\r\nB,b,2,3\r\n',
            'multi-word',
        ],
        [
            {
                "words": [
                    {
                        "case": "not-found-in-audio",
                        "word": "A",
                        "alignedWord": "a",
                        "start": 0,
                        "end": 1
                    },
                ]
            },
            'A,a,0,1\r\n',
            'not found in audio',
        ],
        [
            {
                "words": [
                    {
                        "case": "not-found-in-transcript",
                        "word": "A",
                        "alignedWord": "a",
                        "start": 0,
                        "end": 1
                    },
                ]
            },
            '',
            'not found in transcript',
        ],
    ]

    for alignment, want, name in tests:
        # Named `alignment` so the builtin `input` is not shadowed.
        got = to_csv(alignment)
        # Include the case name so a failure points at the offending entry.
        assert_equals(want, got, '%s: expected %r, got %r' % (name, want, got))