Пример #1
0
def save_transcription(item, clips, speakers=None, engine=None,
                       raw_files=None, logger=None):
    """
    Store an automatically-generated transcript for `item` as a new
    'captions' track named 'Automatic transcription'.

    `item`: the object the new Track is attached to.

    `clips`: Array of Clip objects. Must support `len()`. Every clip is
    pointed at the new track and all of them are inserted with a single
    `bulk_create` call.

    `speakers`: (optional) Array of Speaker objects. Each one is
    attached to the new track and saved individually. A falsy value is
    treated as an empty list.

    `engine`: FIXME -- accepted but not used by this function.

    `raw_files`: FIXME Any raw files resulting from the transcription.
    Each file is represented by a dict with keys `content_type`,
    `file_name` and `body`. Accepted but not used by this function.

    `logger`: (optional) logger used for the progress message; when
    falsy, one is obtained from `logging.get_logger(__name__)`.
    """
    speakers = speakers or []
    logger = logger or logging.get_logger(__name__)

    logger.info(u"Saving transcript with %d speakers, %d clips for item %s",
                len(speakers), len(clips), item)

    # Track, speakers and clips are all written inside one
    # commit_on_success() block.
    with transaction.commit_on_success():
        transcript_track = Track(item=item,
                                 kind='captions',
                                 name='Automatic transcription')
        transcript_track.save()

        for spk in speakers:
            spk.track = transcript_track
            spk.save()

        for segment in clips:
            # Re-assigning the speaker to itself looks like a no-op, but
            # it is required for the speaker_id foreign key to be
            # updated correctly. Do not remove.
            segment.speaker = segment.speaker
            segment.track = transcript_track

        Clip.objects.bulk_create(clips)
Пример #2
0
def save_transcription(item,
                       clips,
                       speakers=None,
                       engine=None,
                       raw_files=None,
                       logger=None):
    """
    Persist an automatically-generated transcript for `item`.

    A new Track (kind 'captions', name 'Automatic transcription') is
    created for `item`; the given speakers and clips are attached to it.

    `clips`: Array of Clip objects (must support `len()`); inserted in
    one `bulk_create` call after being pointed at the new track.

    `speakers`: (optional): Array of Speaker objects; each is saved
    after being pointed at the new track. Falsy means "no speakers".

    `engine`: FIXME -- currently ignored by this function.

    `raw_files`: FIXME Any raw files resulting from the transcription.
    Each file is represented by a dict with keys `content_type`,
    `file_name` and `body`. Currently ignored by this function.

    `logger`: (optional) logger for the progress message; defaults to
    `logging.get_logger(__name__)` when falsy.
    """
    if not speakers:
        speakers = []
    if not logger:
        logger = logging.get_logger(__name__)

    speaker_count = len(speakers)
    clip_count = len(clips)
    logger.info(u"Saving transcript with %d speakers, %d clips for item %s",
                speaker_count, clip_count, item)

    with transaction.commit_on_success():
        new_track = Track(
            item=item,
            kind='captions',
            name='Automatic transcription',
        )
        new_track.save()

        for person in speakers:
            person.track = new_track
            person.save()

        for entry in clips:
            # The self-assignment below is deliberate: without it the
            # speaker_id foreign key is not updated correctly.
            entry.speaker = entry.speaker  # Necessary!
            entry.track = new_track

        Clip.objects.bulk_create(clips)
Пример #3
0
def item_add_track(request, item_id):
    """
    View for adding a track to the item with primary key `item_id`.

    Responds with 404 if no such Item exists. The POST data selects one
    of three actions:

    `new_track`: create an empty Track from the TrackMetadataForm
    fields and redirect to `edit_track` for it.

    `upload_track`: read clips (and, for the 'koemei' format, speakers)
    from the uploaded `caption_file`, save them on a new Track named
    after the uploaded file, and redirect to `edit_track` for it.

    `request_transcript`: ask the item for a transcription by the
    chosen engine and redirect to 'spindle_queue'.

    If no action is present, or the submitted form is invalid, the
    'spindle/item_add_transcript.html' template is rendered with all
    three forms; the submitted form stays bound to the POST data while
    the others are fresh unbound instances.

    Raises Exception for an unrecognised `file_type` or an engine not
    in `spindle.transcribe.engine_map()`.
    """
    item = get_object_or_404(Item, pk=item_id)
    new_track_form = upload_track_form = request_transcript_form = None

    if 'new_track' in request.POST:
        new_track_form = TrackMetadataForm(request.POST)
        if new_track_form.is_valid():
            data = new_track_form.cleaned_data
            track = Track(item=item, **data)
            track.save()
            return redirect(edit_track, track_id=track.id)
    elif 'upload_track' in request.POST:
        upload_track_form = UploadTrackForm(request.POST, request.FILES)
        if upload_track_form.is_valid():
            data = upload_track_form.cleaned_data
            file_type = data['file_type']
            upload = request.FILES['caption_file']

            if file_type == 'srt':
                speakers = []
                clips = vtt.read(upload)
            elif file_type == 'xmp':
                speakers = []
                clips = xmp.read(upload)
            elif file_type == 'koemei':
                xml = ET.parse(upload)
                objects = koemei.reader.read(xml)
                speakers = objects['speakers']
                clips = objects['clips']
            elif file_type == 'sphinx':
                speakers = []
                clips = sphinx.reader.read_clips(upload)
            else:
                raise Exception('Unrecognised caption file format')

            track = Track(item=item, name=upload.name)
            track.save()

            for speaker in speakers:
                speaker.track = track
                speaker.save()

            for clip in clips:
                clip.track = track
                # Next line is necessary to save the foreign key
                # correctly. Sigh.
                clip.speaker = clip.speaker
                clip.save()

            # Redirect only when a track was actually created. With an
            # invalid form `track` is never bound, so we must fall
            # through and re-render the form with its errors.
            return redirect(edit_track, track_id=track.id)
    elif 'request_transcript' in request.POST:
        request_transcript_form = RequestTranscriptForm(request.POST)
        if request_transcript_form.is_valid():
            engine = request_transcript_form.cleaned_data['engine']
            if engine not in spindle.transcribe.engine_map():
                raise Exception(u"Bad value for engine: {}".format(engine))

            item.request_transcription(engine)
            return redirect('spindle_queue')

    # Any form that was not submitted is shown unbound.
    if new_track_form is None:
        new_track_form = TrackMetadataForm()
    if upload_track_form is None:
        upload_track_form = UploadTrackForm()
    if request_transcript_form is None:
        request_transcript_form = RequestTranscriptForm()

    return render(
        request, 'spindle/item_add_transcript.html', {
            'item': item,
            'new_track_form': new_track_form,
            'upload_track_form': upload_track_form,
            'request_transcript_form': request_transcript_form,
        })
Пример #4
0
def item_add_track(request, item_id):
    """
    Handle the "add track" page for the Item identified by `item_id`
    (404 if it does not exist).

    Exactly one of these keys in `request.POST` selects the action:

    `new_track`: validate a TrackMetadataForm, create a Track from its
    cleaned data and redirect to `edit_track`.

    `upload_track`: validate an UploadTrackForm, parse the uploaded
    `caption_file` according to `file_type` ('srt', 'xmp', 'koemei' or
    'sphinx'), store the resulting speakers and clips on a new Track
    named after the upload, and redirect to `edit_track`.

    `request_transcript`: validate a RequestTranscriptForm, call
    `item.request_transcription(engine)` and redirect to
    'spindle_queue'.

    When nothing was submitted or validation fails, renders
    'spindle/item_add_transcript.html' with the three forms (the
    submitted one bound, so its errors can be shown).

    Raises Exception on an unknown `file_type` or an engine missing
    from `spindle.transcribe.engine_map()`.
    """
    item = get_object_or_404(Item, pk=item_id)
    new_track_form = upload_track_form = request_transcript_form = None

    if 'new_track' in request.POST:
        new_track_form = TrackMetadataForm(request.POST)
        if new_track_form.is_valid():
            data = new_track_form.cleaned_data
            track = Track(item=item, **data)
            track.save()
            return redirect(edit_track, track_id=track.id)
    elif 'upload_track' in request.POST:
        upload_track_form = UploadTrackForm(request.POST, request.FILES)
        if upload_track_form.is_valid():
            data = upload_track_form.cleaned_data
            file_type = data['file_type']
            upload = request.FILES['caption_file']

            # Only the 'koemei' reader supplies speakers; the other
            # formats yield clips alone.
            if file_type == 'srt':
                speakers = []
                clips = vtt.read(upload)
            elif file_type == 'xmp':
                speakers = []
                clips = xmp.read(upload)
            elif file_type == 'koemei':
                xml = ET.parse(upload)
                objects = koemei.reader.read(xml)
                speakers = objects['speakers']
                clips = objects['clips']
            elif file_type == 'sphinx':
                speakers = []
                clips = sphinx.reader.read_clips(upload)
            else:
                raise Exception('Unrecognised caption file format')

            track = Track(item=item, name=upload.name)
            track.save()

            for speaker in speakers:
                speaker.track = track
                speaker.save()

            for clip in clips:
                clip.track = track
                # Next line is necessary to save the foreign key
                # correctly. Sigh.
                clip.speaker = clip.speaker
                clip.save()

            # This redirect belongs to the valid-form path: `track`
            # does not exist when validation fails, in which case the
            # bound form is re-rendered below instead.
            return redirect(edit_track, track_id=track.id)
    elif 'request_transcript' in request.POST:
        request_transcript_form = RequestTranscriptForm(request.POST)
        if request_transcript_form.is_valid():
            engine = request_transcript_form.cleaned_data['engine']
            if engine not in spindle.transcribe.engine_map():
                raise Exception(u"Bad value for engine: {}".format(engine))

            item.request_transcription(engine)
            return redirect('spindle_queue')

    # Fill in unbound forms for the actions that were not submitted.
    if new_track_form is None:
        new_track_form = TrackMetadataForm()
    if upload_track_form is None:
        upload_track_form = UploadTrackForm()
    if request_transcript_form is None:
        request_transcript_form = RequestTranscriptForm()

    return render(request, 'spindle/item_add_transcript.html', {
            'item': item,
            'new_track_form': new_track_form,
            'upload_track_form': upload_track_form,
            'request_transcript_form': request_transcript_form,
            })
    def handle(self, *args, **options):
        """
        Import transcripts listed in an index file as tracks.

        `args[0]`: path of the index file. Each line must contain a URL
        and a file name separated by a single space.

        `args[1]`: directory in which the listed files are looked up.

        For every index line the item is looked up by URL, first in
        `items.audio` and then in `items.video`. Items whose existing
        tracks named TRACK_NAME already contain clips are skipped.
        Otherwise a new Track and a single Speaker ("Speaker 1") are
        created, the clips returned by `read_clips` are bulk-inserted,
        and `item.archive()` is called.

        Progress and a final summary of unmatched URLs and failed
        files are written to `self.stderr`.
        """
        index_filename = args[0]
        data_dir = args[1]
        verbose = False

        url_not_found = []
        file_not_found = []

        items = Item.objects.bulk_fetch()

        # Read the whole index inside a `with` block so the file handle
        # is always closed; the line count feeds the progress figure.
        with open(index_filename) as index:
            lines = index.readlines()
        total_count = len(lines)

        for idx, line in enumerate(lines):
            url, filename = line.split(" ")
            filename = filename.strip()

            self.stderr.write(u'\n{:4.1f}% {}\n'.format(
                100 * float(idx) / total_count, url))

            try:
                item = items.audio[url]
            except KeyError:
                try:
                    item = items.video[url]
                except KeyError:
                    self.stderr.write(u"No item found -- not imported\n\n")
                    url_not_found.append(url)
                    continue

            self.stderr.write(u'{} {}\n'.format(item.id, item.name))
            existing_tracks = item.track_set.filter(name__exact=TRACK_NAME)

            # Any clip on an earlier import's track means we are done
            # with this item.
            if sum(track.clip_count for track in existing_tracks.all()):
                self.stderr.write("Already imported\n\n")
                continue

            track = Track(item=item, name=TRACK_NAME)
            track.save()

            speaker = Speaker(track=track, name="Speaker 1")
            speaker.save()

            path = os.path.join(data_dir, filename)

            try:
                clips = []
                with open(path) as sphinx_output:
                    for clip in read_clips(sphinx_output, speaker=speaker):
                        if verbose:
                            self.stderr.write(u"{:6.1f} {:6.1f} {}\n".format(
                                clip.intime, clip.outtime, clip.caption_text))
                        clip.track = track
                        clips.append(clip)

                Clip.objects.bulk_create(clips)
                item.archive()
                self.stderr.write('\n\n')

            except Exception as err:
                # NOTE: this also catches failures from bulk_create()
                # and archive(), which are then reported under
                # "files not found" in the summary.
                self.stderr.write(u"Error in reading {}: {}".format(path, err))
                file_not_found.append(path)
                continue

        if url_not_found:
            self.stderr.write("{} URLs not found in database:\n".format(
                len(url_not_found)))
            for url in url_not_found:
                self.stderr.write(u'\t{}\n'.format(url))

        if file_not_found:
            self.stderr.write("{} files not found:\n".format(
                len(file_not_found)))
            for path in file_not_found:
                self.stderr.write(u'\t{}\n'.format(path))
Пример #6
0
    def handle(self, *args, **options):
        """
        Create a track of clips for each item listed in an index file.

        `args[0]`: index file path; one "<url> <filename>" pair per
        line, separated by exactly one space.

        `args[1]`: directory containing the files named in the index.

        Each URL is resolved against `items.audio`, then `items.video`;
        unmatched URLs are collected and reported at the end. An item
        is skipped when its tracks named TRACK_NAME already hold clips.
        For the remaining items a Track and a Speaker named "Speaker 1"
        are saved, the clips produced by `read_clips` are stored with
        `bulk_create`, and the item is archived via `item.archive()`.

        All output, including the closing summary, goes to
        `self.stderr`.
        """
        index_filename = args[0]
        data_dir = args[1]
        verbose = False

        url_not_found = []
        file_not_found = []

        items = Item.objects.bulk_fetch()

        # Load the index through a context manager so the handle is
        # released even if a later step raises; its length is the
        # denominator of the progress percentage.
        with open(index_filename) as index:
            lines = index.readlines()
        total_count = len(lines)

        for idx, line in enumerate(lines):
            url, filename = line.split(" ")
            filename = filename.strip()

            self.stderr.write(u'\n{:4.1f}% {}\n'.format(
                    100 * float(idx) / total_count, url))

            try:
                item = items.audio[url]
            except KeyError:
                try:
                    item = items.video[url]
                except KeyError:
                    self.stderr.write(u"No item found -- not imported\n\n")
                    url_not_found.append(url)
                    continue

            self.stderr.write(u'{} {}\n'.format(item.id, item.name))
            existing_tracks = item.track_set.filter(name__exact=TRACK_NAME)

            # Skip items that already have clips from a previous run.
            if sum(track.clip_count for track in existing_tracks.all()):
                self.stderr.write("Already imported\n\n")
                continue

            track = Track(item=item, name=TRACK_NAME)
            track.save()

            speaker = Speaker(track=track, name="Speaker 1")
            speaker.save()

            path = os.path.join(data_dir, filename)

            try:
                clips = []
                with open(path) as sphinx_output:
                    for clip in read_clips(sphinx_output, speaker=speaker):
                        if verbose:
                            self.stderr.write(u"{:6.1f} {:6.1f} {}\n".format(
                                    clip.intime, clip.outtime, clip.caption_text))
                        clip.track = track
                        clips.append(clip)

                Clip.objects.bulk_create(clips)
                item.archive()
                self.stderr.write('\n\n')

            except Exception as err:
                # NOTE: any failure in this step (not only a missing
                # file) lands here and is listed as "not found" below.
                self.stderr.write(u"Error in reading {}: {}".format(path, err))
                file_not_found.append(path)
                continue

        if url_not_found:
            self.stderr.write("{} URLs not found in database:\n".format(
                    len(url_not_found)))
            for url in url_not_found:
                self.stderr.write(u'\t{}\n'.format(url))

        if file_not_found:
            self.stderr.write("{} files not found:\n".format(
                    len(file_not_found)))
            for path in file_not_found:
                self.stderr.write(u'\t{}\n'.format(path))