Exemplo n.º 1
0
def read_captions(captions, options):
    """Parse ``captions`` with the first pycaption reader that detects it.

    Readers are tried in a fixed order: SCC, SRT, SAMI, DFXP, WebVTT.

    captions: the caption content handed to each reader's ``detect``/``read``.
    options: object exposing ``read_invalid_positioning`` (forwarded to every
        reader constructor) plus ``lang`` and ``offset`` (used for SCC only;
        ``offset`` is converted with ``int``).

    Returns whatever the matching reader's ``read`` returns.
    Raises ``Exception`` when no reader detects the content.
    """
    common = dict(read_invalid_positioning=options.read_invalid_positioning)

    # Every reader is constructed up front, in the order it will be tried.
    scc = pycaption.SCCReader(**common)
    fallbacks = (
        pycaption.SRTReader(**common),
        pycaption.SAMIReader(**common),
        pycaption.DFXPReader(**common),
        pycaption.WebVTTReader(**common),
    )

    if scc.detect(captions):
        # Only the SCC reader receives the language and offset options; the
        # language is passed along only when one was actually supplied.
        scc_args = {}
        if options.lang:
            scc_args['lang'] = options.lang
        scc_args['offset'] = int(options.offset)
        return scc.read(captions, **scc_args)

    for candidate in fallbacks:
        if candidate.detect(captions):
            return candidate.read(captions)

    raise Exception('No caption format detected :(')
Exemplo n.º 2
0
    def mk_subs(self, transcriptions, sub_pathname):
        """
        Create a subtitle file for this video.
        It is currently a huge hack, but it works good enough.

        Reads the hard-coded SCC transcript found under
        <self.show_dir>/assets/transcripts, collects the captions lying
        between each transcription's start and end markers (both inclusive),
        shifts their start/end times by a per-transcription offset, and writes
        the result to sub_pathname in SRT format.

        transcriptions:  list of start/end 'pointers' into the source.
            Each item is read as
            {'start': {'timestamp': ..., 'video_time': ..., 'text': ...},
             'end': {'timestamp': ..., 'text': ...}}.
            'timestamp' is compared against caption.format_start();
            'video_time' is multiplied by 1000000 before being subtracted
            from caption.start (presumably seconds vs. microseconds --
            TODO confirm).
        sub_pathname: full path to output file
        """

        transcript_filename = '12022017 NBPY SCC.scc'
        # dt = transcript_filename[:8]

        transcript_pathname = os.path.join(self.show_dir, "assets",
                                           "transcripts", transcript_filename)

        # transcript_start = datetime.datetime.strptime(
        #     dt + " 10:06:56", '%m%d%Y %H:%M:%S' ) - \
        #            datetime.timedelta(0, 2, 158933)

        # Context manager so the transcript handle is closed deterministically.
        with open(transcript_pathname, encoding='iso-8859-1') as scc_file:
            caps = scc_file.read()

        transcript = pycaption.SCCReader().read(caps)
        language = transcript.get_languages()[0]  # ['en-US']
        captions = transcript.get_captions(language)

        out_captions = pycaption.CaptionList()

        # NOTE(review): captions are shifted in place and the same list is
        # re-scanned for every transcription, so later transcriptions see the
        # already-shifted start times of earlier ones. Left as is; confirm
        # that the cut ranges never overlap.
        for transcription in transcriptions:

            copying = False
            for c in captions:

                if c.format_start() == \
                        transcription['start']['timestamp']:
                    # First caption of this cut: from here on captions are
                    # copied, re-based so this one lands on video_time.
                    copying = True
                    offset = c.start - transcription['start'][
                        'video_time'] * 1000000
                    c.nodes[0].content = transcription['start']['text']

                if copying:

                    if c.format_start() == \
                            transcription['end']['timestamp']:
                        # Last caption of this cut: it is still emitted below,
                        # then copying stops.
                        c.nodes[0].content = \
                            transcription['end']['text']
                        copying = False

                    c.start -= offset
                    c.end -= offset
                    out_captions.append(c)

        transcript.set_captions(language, out_captions)

        # writer = pycaption.DFXPWriter()
        writer = pycaption.SRTWriter()

        # Render before opening the output so that a failure while rendering
        # does not leave a truncated, empty subtitle file behind.
        srt_text = writer.write(transcript)
        with open(sub_pathname, 'wt') as sub_file:
            sub_file.write(srt_text)
Exemplo n.º 3
0
    def v4(self, episode):
        """
        Debug helper: print the captions near the start and end of each cut.

        For every applied Cut_List of ``episode`` (ordered by sequence) the
        cut's start/end wall-clock times are turned into offsets from a
        hard-coded ``epoch`` and compared against caption start times from
        the hard-coded SCC transcript. Captions inside a window around the
        cut start, and then around the cut end, are printed to stdout.

        episode: value used to filter Cut_List rows.

        Returns None; output goes to stdout only.
        """

        epoch = datetime.datetime(2017, 12, 2, 10, 6, 36, 841067)
        # 2017-12-02 10:06:36.841067

        # Half-width of the window printed around each cut point, in the same
        # unit as c.start (the wall-clock deltas below are seconds * 1000000).
        window = 4000000

        ## Get transcription data
        transcript_filename = '12022017 NBPY SCC.scc'
        transcript_pathname = os.path.join( self.show_dir,
              "assets", "transcripts", transcript_filename )
        # Context manager so the transcript handle is closed deterministically.
        with open(transcript_pathname, encoding='iso-8859-1') as scc_file:
            caps = scc_file.read()

        transcript = pycaption.SCCReader().read( caps )
        language = transcript.get_languages()[0] # ['en-US']
        captions = transcript.get_captions( language )

        cls = Cut_List.objects.filter(
            episode=episode, apply=True).order_by('sequence')
        # transcriptions = get_transcriptions(cls)
        for cl in cls:
            print( cl.get_start_wall() )

            cl_start = ( cl.get_start_wall() - epoch
                    ).total_seconds() * 1000000
            cl_end = ( cl.get_end_wall() - epoch
                    ).total_seconds() * 1000000

            # States: 0 = looking for the start window, 1 = printing it,
            # 2 = looking for the end window, 3 = printing it.
            # The checks are deliberately sequential (not elif) so a single
            # caption can advance through several states.
            state = 0
            for c in captions:

                # look for start
                if state == 0:
                    if c.start > cl_start - window:
                        print( "start: {}".format(cl.start))
                        state = 1

                # print a bunch of start
                if state == 1:
                    print("{} {}".format(c.format_start(), c.get_text() ))

                    if c.start > cl_start + window:
                        print()
                        state = 2

                # look for end
                if state == 2:
                    if c.start > cl_end - window:
                        print( "end: {}".format(cl.end))
                        state = 3

                # print a bunch of end
                if state == 3:
                    print("{} {}".format(c.format_start(), c.get_text() ))

                    if c.start > cl_end + window:
                        print()
                        # Both windows are done; nothing is left to do for
                        # the remaining captions of this cut.
                        break
Exemplo n.º 4
0
def main(argv):
    """Convert a caption file between formats and print the result to stdout.

    argv: command-line arguments without the program name, e.g.
        ['-i', 'in.srt', '-f', 'srt', '-t', 'dfxp'].
        -h                  print usage and exit
        -i / --ifile FILE   input caption file
        -f / --sfile TYPE   input format  (srt, scc, webvtt, dfxp)
        -t / --tfile TYPE   output format (srt, scc, webvtt, dfxp)
        Format names are matched case-insensitively.

    Exits via ``sys.exit``: no status for -h, 2 for malformed options or a
    missing input file argument, 1 for identical or unknown formats.
    On success the converted captions are printed and the function returns.
    """
    usage = 'test.py -i <inputfile> -f <intputType> -t <outputType>'

    # Format keyword -> pycaption class name. Names are resolved with getattr
    # only once they are needed, so argument validation never touches
    # pycaption.
    readers = {
        'scc': 'SCCReader',
        'srt': 'SRTReader',
        'dfxp': 'DFXPReader',
        'webvtt': 'WebVTTReader',
    }
    writers = {
        'scc': 'SCCWriter',
        'srt': 'SRTWriter',
        'dfxp': 'DFXPWriter',
        'webvtt': 'WebVTTWriter',
    }

    inputfile = ''
    inputType = ''
    outputType = ''

    try:
        # "h" carries no colon: -h is a plain flag and must not demand an
        # argument. The long options are declared so the names tested below
        # can actually be produced by getopt.
        opts, _ = getopt.getopt(argv, "hi:f:t:",
                                ["ifile=", "sfile=", "tfile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-f", "--sfile"):
            inputType = arg
        elif opt in ("-t", "--tfile"):
            outputType = arg

    # Normalise once so the same-format check agrees with the
    # case-insensitive dispatch below.
    inputValue = inputType.lower()
    outputValue = outputType.lower()

    if inputValue == outputValue:
        print('Error: input type and output type are same format')
        sys.exit(1)

    # Validate both formats before reading or parsing anything, so a typo in
    # the output type is reported without doing the conversion work first.
    if inputValue not in readers:
        print('Error: invalid input type. <srt/scc/webvtt/dfxp> allowed')
        sys.exit(1)
    if outputValue not in writers:
        print('Error: invalid output type. <srt/scc/webvtt/dfxp> allowed')
        sys.exit(1)

    if not inputfile:
        # Without -i there is nothing to open; report it as a usage error
        # instead of failing inside io.open('').
        print(usage)
        sys.exit(2)

    with io.open(inputfile) as f:
        str1 = f.read()

    c = getattr(pycaption, readers[inputValue])().read(str1)
    print(getattr(pycaption, writers[outputValue])().write(c))
Exemplo n.º 5
0
    def v3(self, episode):
        """
        Debug helper: locate each transcription's start caption and inspect it.

        Loads the hard-coded SCC transcript, builds the transcriptions for the
        applied Cut_List rows of ``episode`` via get_transcriptions(), and for
        each one scans the captions. When a caption's formatted start equals
        the transcription's start timestamp, the derived values (offset and
        the wall-clock ``epoch`` at which the transcript would have started)
        are printed and an interactive console is opened. Captions from the
        start marker through the end marker are shifted in place; nothing is
        written out or returned.

        episode: value used to filter Cut_List rows.
        """

        ## Get transcription data
        transcript_filename = '12022017 NBPY SCC.scc'
        transcript_pathname = os.path.join( self.show_dir,
              "assets", "transcripts", transcript_filename )
        # Context manager so the transcript handle is closed deterministically.
        with open(transcript_pathname, encoding='iso-8859-1') as scc_file:
            caps = scc_file.read()

        transcript = pycaption.SCCReader().read( caps )
        language = transcript.get_languages()[0] # ['en-US']
        captions = transcript.get_captions( language )

        ## Get markers for this video
        cls = Cut_List.objects.filter(
            episode=episode, apply=True).order_by('sequence')
        transcriptions = get_transcriptions(cls)

        for transcription in transcriptions:
            pprint(transcription)

            state = 0
            for c in captions:

                if c.format_start() == \
                        transcription['start']['timestamp']:

                    state=1
                    offset = c.start - transcription['start']['video_time'] * 1000000
                    wc = transcription['start']['wallclock']
                    # walltime that transcription file started.
                    epoch = wc - datetime.timedelta(microseconds = c.start )

                    print( "c: {c}\nc.start: {start}\nwall_clock: {wallclock}".format(
                        c=c, start=c.start, wallclock=wc ) )
                    print("epoch: {}".format( epoch ))

                    # Deliberate debugging hook: prints a hint for leaving the
                    # session, then opens an interactive console exposing the
                    # local variables of this method.
                    print("import sys; sys.exit()")
                    import code
                    code.interact(local=locals())

                if state==1:

                    if c.format_start() == \
                            transcription['end']['timestamp']:
                        # End marker reached: this caption is still shifted
                        # below, then shifting stops.
                        c.nodes[0].content=\
                                transcription['end']['text']
                        state = 0

                    c.start -= offset
                    c.end -= offset
def read_captions(captions, options):
    """Parse ``captions`` with the first pycaption reader that detects it.

    Readers are tried in a fixed order: SCC, SRT, SAMI, DFXP.

    captions: the caption content handed to each reader's ``detect``/``read``.
    options: object exposing ``lang`` and ``offset``; both are used for SCC
        content only. ``offset`` is converted with ``float``.

    Returns whatever the matching reader's ``read`` returns.
    Raises ``Exception`` when no reader detects the content.
    """
    scc_reader = pycaption.SCCReader()
    srt_reader = pycaption.SRTReader()
    sami_reader = pycaption.SAMIReader()
    dfxp_reader = pycaption.DFXPReader()

    if scc_reader.detect(captions):
        scc_kwargs = {}
        if options.lang:
            scc_kwargs['lang'] = options.lang
        # The offset used to be converted with int() when a language was given
        # and float() otherwise, so a fractional offset worked or raised
        # depending on an unrelated option. float() accepts every value either
        # branch accepted and is now used in both cases.
        scc_kwargs['offset'] = float(options.offset)
        return scc_reader.read(captions, **scc_kwargs)

    for reader in (srt_reader, sami_reader, dfxp_reader):
        if reader.detect(captions):
            return reader.read(captions)

    raise Exception('No caption format detected :(')
Exemplo n.º 7
0
    def v6(self, episode):
        """
        Debug helper: print the captions around the episode's overall start
        and end.

        Takes the start wall-clock time of the first applied Cut_List and the
        end wall-clock time of the last one (ordered by sequence), converts
        each to an offset from a hard-coded ``epoch`` and prints the captions
        of the hard-coded SCC transcript that fall in a window around it.

        episode: value used to filter Cut_List rows.

        Returns None; output goes to stdout only.
        """

        def show_near( x, wall ):
            # Print label ``x`` and ``wall``, then every caption from the first
            # one starting later than 9000000 before ``wall`` up to and
            # including the first one starting later than 26000000 after it
            # (same unit as c.start; the delta below is seconds * 1000000).
            # ``epoch`` and ``captions`` are closed over from the enclosing
            # method and are assigned before the first call.

            from_epoch = ( wall - epoch
                    ).total_seconds() * 1000000

            state = 0
            for c in captions:

                if state == 0:
                    if c.start > from_epoch - 9000000:
                        print( "{}: {}".format(x, wall))
                        state = 1

                if state == 1:
                    print("{} {}".format(c.format_start(), c.get_text() ))

                    if c.start > from_epoch + 26000000:
                        print()
                        return



        epoch = datetime.datetime(2017, 12, 2, 10, 6, 36, 841067)
        # 2017-12-02 10:06:36.841067

        ## Get transcription data
        transcript_filename = '12022017 NBPY SCC.scc'
        transcript_pathname = os.path.join( self.show_dir,
              "assets", "transcripts", transcript_filename )
        # Context manager so the transcript handle is closed deterministically.
        with open(transcript_pathname, encoding='iso-8859-1') as scc_file:
            caps = scc_file.read()

        transcript = pycaption.SCCReader().read( caps )
        language = transcript.get_languages()[0] # ['en-US']
        captions = transcript.get_captions( language )

        cls = Cut_List.objects.filter(
            episode=episode, apply=True).order_by('sequence')
        # NOTE(review): if no cut list matches, first()/last() presumably
        # return None and these calls raise AttributeError -- confirm whether
        # an episode without applied cuts can reach this method.
        show_near( "start", cls.first().get_start_wall() )
        show_near( "end", cls.last().get_end_wall() )