def save_transcription(item, clips, speakers=None, engine=None,
                       raw_files=None, logger=None):
    """
    Store an automatically-generated transcript for `item` as a new
    captions track named 'Automatic transcription'.

    `item`: The object the new Track is attached to
    `clips`: Array of Clip objects; they are attached to the new track
        and inserted with a single bulk_create call
    `speakers`: (optional) Array of Speaker objects; each one is attached
        to the new track and saved individually
    `engine`: Accepted, but not used anywhere in this function body
    `raw_files`: Any raw files resulting from the transcription. Each
        file is represented by a dict with keys `content_type`,
        `file_name` and `body`. Accepted, but not used anywhere in this
        function body.
    `logger`: (optional) Logger used for the progress message; a
        module-level logger is looked up when none is given
    """
    speakers = speakers or []
    # NOTE(review): the standard library spells this `logging.getLogger`;
    # confirm that `logging` in this module is a wrapper that really
    # provides `get_logger`.
    logger = logger or logging.get_logger(__name__)

    logger.info(u"Saving transcript with %d speakers, %d clips for item %s",
                len(speakers), len(clips), item)

    # Track, speakers and clips are written inside one commit_on_success
    # block so they are committed together.
    with transaction.commit_on_success():
        new_track = Track(item=item,
                          kind='captions',
                          name='Automatic transcription')
        new_track.save()

        for person in speakers:
            person.track = new_track
            person.save()

        for segment in clips:
            # This self-assignment looks like a no-op but must stay: per
            # the original author it is required for the speaker_id
            # foreign key to be updated correctly (presumably because the
            # speaker had no primary key yet when it was first assigned).
            segment.speaker = segment.speaker
            segment.track = new_track

        Clip.objects.bulk_create(clips)
def item_add_track(request, item_id):
    """
    View for adding a track to the item with primary key `item_id`.

    The POST data selects one of three actions by the presence of a key:

    `new_track`: create an empty Track from the TrackMetadataForm fields
        and redirect to `edit_track` for it
    `upload_track`: parse the uploaded `caption_file` according to the
        form's `file_type` ('srt', 'xmp', 'koemei' or 'sphinx'), store the
        resulting speakers and clips on a new Track named after the
        upload, and redirect to `edit_track` for it
    `request_transcript`: ask the item for a transcription by the chosen
        engine and redirect to 'spindle_queue'

    In every other case (no action, or an invalid form) the
    'spindle/item_add_transcript.html' template is rendered with the
    three forms; a submitted-but-invalid form is passed through bound.

    Raises a 404 if the item does not exist, and a plain Exception for an
    unrecognised `file_type` or an engine missing from
    `spindle.transcribe.engine_map()`.
    """
    item = get_object_or_404(Item, pk=item_id)

    new_track_form = None
    upload_track_form = None
    request_transcript_form = None

    if 'new_track' in request.POST:
        new_track_form = TrackMetadataForm(request.POST)
        if new_track_form.is_valid():
            created = Track(item=item, **new_track_form.cleaned_data)
            created.save()
            return redirect(edit_track, track_id=created.id)

    elif 'upload_track' in request.POST:
        upload_track_form = UploadTrackForm(request.POST, request.FILES)
        if upload_track_form.is_valid():
            file_type = upload_track_form.cleaned_data['file_type']
            upload = request.FILES['caption_file']

            # Only the koemei format carries speakers of its own.
            speakers = []
            if file_type == 'srt':
                clips = vtt.read(upload)
            elif file_type == 'xmp':
                clips = xmp.read(upload)
            elif file_type == 'koemei':
                parsed = koemei.reader.read(ET.parse(upload))
                speakers = parsed['speakers']
                clips = parsed['clips']
            elif file_type == 'sphinx':
                clips = sphinx.reader.read_clips(upload)
            else:
                raise Exception('Unrecognised caption file format')

            created = Track(item=item, name=upload.name)
            created.save()

            for person in speakers:
                person.track = created
                person.save()

            for segment in clips:
                segment.track = created
                # Self-assignment kept on purpose: the original author
                # found it necessary for the speaker foreign key to be
                # saved correctly.
                segment.speaker = segment.speaker
                segment.save()

            return redirect(edit_track, track_id=created.id)

    elif 'request_transcript' in request.POST:
        request_transcript_form = RequestTranscriptForm(request.POST)
        if request_transcript_form.is_valid():
            engine = request_transcript_form.cleaned_data['engine']
            if engine not in spindle.transcribe.engine_map():
                raise Exception(u"Bad value for engine: {}".format(engine))
            item.request_transcription(engine)
            return redirect('spindle_queue')

    # Any form that was not submitted is shown unbound.
    context = {
        'item': item,
        'new_track_form': (
            TrackMetadataForm() if new_track_form is None
            else new_track_form),
        'upload_track_form': (
            UploadTrackForm() if upload_track_form is None
            else upload_track_form),
        'request_transcript_form': (
            RequestTranscriptForm() if request_transcript_form is None
            else request_transcript_form),
    }
    return render(request, 'spindle/item_add_transcript.html', context)
def _read_caption_upload(file_type, upload):
    """
    Parse the uploaded caption file `upload` according to `file_type`.

    Returns a `(speakers, clips)` pair. Only the 'koemei' format yields
    speakers; 'srt', 'xmp' and 'sphinx' return an empty speaker list.
    Raises a plain Exception for any other `file_type`.
    """
    if file_type == 'srt':
        return [], vtt.read(upload)
    if file_type == 'xmp':
        return [], xmp.read(upload)
    if file_type == 'koemei':
        objects = koemei.reader.read(ET.parse(upload))
        return objects['speakers'], objects['clips']
    if file_type == 'sphinx':
        return [], sphinx.reader.read_clips(upload)
    raise Exception('Unrecognised caption file format')


def item_add_track(request, item_id):
    """
    Add a track to the item identified by `item_id`.

    Which action runs depends on the key present in the POST data:
    `new_track` (blank track from metadata), `upload_track` (track built
    from an uploaded caption file) or `request_transcript` (queue an
    automatic transcription). A successful action redirects -- to
    `edit_track` for the first two, to 'spindle_queue' for the third.
    Otherwise 'spindle/item_add_transcript.html' is rendered with all
    three forms, keeping a submitted-but-invalid form bound.

    Responds with a 404 when the item does not exist; raises a plain
    Exception when the requested engine is not in
    `spindle.transcribe.engine_map()` or the caption format is unknown.
    """
    item = get_object_or_404(Item, pk=item_id)
    new_track_form = upload_track_form = request_transcript_form = None

    if 'new_track' in request.POST:
        new_track_form = TrackMetadataForm(request.POST)
        if new_track_form.is_valid():
            metadata = new_track_form.cleaned_data
            track = Track(item=item, **metadata)
            track.save()
            return redirect(edit_track, track_id=track.id)
    elif 'upload_track' in request.POST:
        upload_track_form = UploadTrackForm(request.POST, request.FILES)
        if upload_track_form.is_valid():
            cleaned = upload_track_form.cleaned_data
            file_type = cleaned['file_type']
            upload = request.FILES['caption_file']
            speakers, clips = _read_caption_upload(file_type, upload)

            track = Track(item=item, name=upload.name)
            track.save()
            for speaker in speakers:
                speaker.track = track
                speaker.save()
            for clip in clips:
                clip.track = track
                # Deliberate self-assignment: the original author notes it
                # is needed for the foreign key to be saved correctly.
                clip.speaker = clip.speaker
                clip.save()
            return redirect(edit_track, track_id=track.id)
    elif 'request_transcript' in request.POST:
        request_transcript_form = RequestTranscriptForm(request.POST)
        if request_transcript_form.is_valid():
            engine = request_transcript_form.cleaned_data['engine']
            if engine not in spindle.transcribe.engine_map():
                raise Exception(u"Bad value for engine: {}".format(engine))
            item.request_transcription(engine)
            return redirect('spindle_queue')

    # Fill in unbound forms for every action that was not submitted.
    if new_track_form is None:
        new_track_form = TrackMetadataForm()
    if upload_track_form is None:
        upload_track_form = UploadTrackForm()
    if request_transcript_form is None:
        request_transcript_form = RequestTranscriptForm()

    template_context = {
        'item': item,
        'new_track_form': new_track_form,
        'upload_track_form': upload_track_form,
        'request_transcript_form': request_transcript_form,
    }
    return render(request,
                  'spindle/item_add_transcript.html',
                  template_context)
def handle(self, *args, **options):
    """
    Import transcript files listed in an index file as tracks.

    `args[0]`: Path of the index file. Every non-blank line holds a URL
        and a file name separated by a single space.
    `args[1]`: Directory containing the files named in the index.

    For each index line the item is looked up by URL (audio first, then
    video). Items that already have clips on a track named TRACK_NAME
    are skipped. Otherwise a new Track and a single Speaker ("Speaker 1")
    are created, the clips read from the file are bulk-inserted and the
    item is archived. Progress and a final summary of URLs and files
    that could not be imported are written to `self.stderr`.
    """
    index_filename = args[0]
    data_dir = args[1]
    verbose = False  # set to True to echo every clip while importing
    url_not_found = []
    file_not_found = []

    items = Item.objects.bulk_fetch()

    # Read the index once inside a `with` block so the file handle is
    # always closed. Blank lines (e.g. a trailing newline) are dropped
    # here; they would otherwise abort the run when unpacking below.
    with open(index_filename) as index:
        entries = [line for line in index if line.strip()]
    total_count = len(entries)

    for idx, line in enumerate(entries):
        url, filename = line.split(" ")
        filename = filename.strip()
        self.stderr.write(u'\n{:4.1f}% {}\n'.format(
            100 * float(idx) / total_count, url))

        try:
            item = items.audio[url]
        except KeyError:
            try:
                item = items.video[url]
            except KeyError:
                self.stderr.write(u"No item found -- not imported\n\n")
                url_not_found.append(url)
                continue

        self.stderr.write(u'{} {}\n'.format(item.id, item.name))

        # Only a track that actually holds clips counts as imported.
        existing_tracks = item.track_set.filter(name__exact=TRACK_NAME)
        if sum(track.clip_count for track in existing_tracks.all()):
            self.stderr.write("Already imported\n\n")
            continue

        track = Track(item=item, name=TRACK_NAME)
        track.save()
        speaker = Speaker(track=track, name="Speaker 1")
        speaker.save()

        path = os.path.join(data_dir, filename)
        try:
            clips = []
            with open(path) as sphinx_output:
                for clip in read_clips(sphinx_output, speaker=speaker):
                    if verbose:
                        self.stderr.write(u"{:6.1f} {:6.1f} {}\n".format(
                            clip.intime, clip.outtime, clip.caption_text))
                    clip.track = track
                    clips.append(clip)
            Clip.objects.bulk_create(clips)
            item.archive()
            self.stderr.write('\n\n')
        except Exception as err:
            # Best effort: report the failure and carry on with the next
            # entry. The Track and Speaker saved above are left in place.
            self.stderr.write(u"Error in reading {}: {}".format(path, err))
            file_not_found.append(path)

    if url_not_found:
        self.stderr.write("{} URLs not found in database:\n".format(
            len(url_not_found)))
        for url in url_not_found:
            self.stderr.write(u'\t{}\n'.format(url))

    if file_not_found:
        self.stderr.write("{} files not found:\n".format(
            len(file_not_found)))
        for path in file_not_found:
            self.stderr.write(u'\t{}\n'.format(path))
def handle(self, *args, **options):
    """
    Import transcript files listed in an index file as tracks.

    `args[0]`: Path of the index file. Every non-blank line holds a URL
        and a file name separated by a single space.
    `args[1]`: Directory containing the files named in the index.

    For each index line the item is looked up by URL (audio first, then
    video). Items that already have clips on a track named TRACK_NAME
    are skipped. Otherwise a new Track and a single Speaker ("Speaker 1")
    are created, the clips read from the file are bulk-inserted and the
    item is archived. Progress and a final summary of URLs and files
    that could not be imported are written to `self.stderr`.
    """
    index_filename = args[0]
    data_dir = args[1]
    verbose = False  # set to True to echo every clip while importing
    url_not_found = []
    file_not_found = []

    items = Item.objects.bulk_fetch()

    # Read the index once inside a `with` block so the file handle is
    # always closed. Blank lines (e.g. a trailing newline) are dropped
    # here; they would otherwise abort the run when unpacking below.
    with open(index_filename) as index:
        entries = [line for line in index if line.strip()]
    total_count = len(entries)

    for idx, line in enumerate(entries):
        url, filename = line.split(" ")
        filename = filename.strip()
        self.stderr.write(u'\n{:4.1f}% {}\n'.format(
            100 * float(idx) / total_count, url))

        try:
            item = items.audio[url]
        except KeyError:
            try:
                item = items.video[url]
            except KeyError:
                self.stderr.write(u"No item found -- not imported\n\n")
                url_not_found.append(url)
                continue

        self.stderr.write(u'{} {}\n'.format(item.id, item.name))

        # Only a track that actually holds clips counts as imported.
        existing_tracks = item.track_set.filter(name__exact=TRACK_NAME)
        if sum(track.clip_count for track in existing_tracks.all()):
            self.stderr.write("Already imported\n\n")
            continue

        track = Track(item=item, name=TRACK_NAME)
        track.save()
        speaker = Speaker(track=track, name="Speaker 1")
        speaker.save()

        path = os.path.join(data_dir, filename)
        try:
            clips = []
            with open(path) as sphinx_output:
                for clip in read_clips(sphinx_output, speaker=speaker):
                    if verbose:
                        self.stderr.write(u"{:6.1f} {:6.1f} {}\n".format(
                            clip.intime, clip.outtime, clip.caption_text))
                    clip.track = track
                    clips.append(clip)
            Clip.objects.bulk_create(clips)
            item.archive()
            self.stderr.write('\n\n')
        except Exception as err:
            # Best effort: report the failure and carry on with the next
            # entry. The Track and Speaker saved above are left in place.
            self.stderr.write(u"Error in reading {}: {}".format(path, err))
            file_not_found.append(path)

    if url_not_found:
        self.stderr.write("{} URLs not found in database:\n".format(
            len(url_not_found)))
        for url in url_not_found:
            self.stderr.write(u'\t{}\n'.format(url))

    if file_not_found:
        self.stderr.write("{} files not found:\n".format(
            len(file_not_found)))
        for path in file_not_found:
            self.stderr.write(u'\t{}\n'.format(path))