Exemplo n.º 1
0
def write_captions(content, options):
    """Print ``content`` in every caption format enabled on ``options``.

    Args:
        content: Caption data handed unchanged to each writer's ``write``.
        options: Object exposing ``video_width`` and ``video_height`` (falsy,
            or a value ``int()`` accepts) plus the truthy/falsy flags
            ``sami``, ``dfxp``, ``srt``, ``transcript``, ``vtt`` and
            ``unpositioned_dfxp``.

    Raises:
        ValueError: If a non-empty width/height cannot be converted by
            ``int()``.
    """
    writer_kwargs = {
        'video_width':
        int(options.video_width) if options.video_width else None,
        'video_height':
        int(options.video_height) if options.video_height else None,
    }

    def emit(writer_cls):
        """Render ``content`` with ``writer_cls`` and print the result."""
        text = writer_cls(**writer_kwargs).write(content)
        # Only encode when the result is not already the native ``str`` type
        # (i.e. Python 2 ``unicode``).  Encoding unconditionally makes
        # Python 3 print the ``b'...'`` repr instead of the caption text.
        if not isinstance(text, str):
            text = text.encode("utf-8")
        print(text)

    if options.sami:
        emit(pycaption.SAMIWriter)
    if options.dfxp:
        emit(pycaption.DFXPWriter)
    if options.srt:
        emit(pycaption.SRTWriter)
    if options.transcript:
        emit(pycaption.TranscriptWriter)
    if options.vtt:
        emit(pycaption.WebVTTWriter)
    if options.unpositioned_dfxp:
        emit(pycaption.dfxp.SinglePositioningDFXPWriter)
Exemplo n.º 2
0
    def mk_subs(self, transcriptions, sub_pathname):
        """
        Create an SRT subtitle file for this video.
        It is currently a huge hack, but it works good enough.

        The captions are read from a hard-coded SCC transcript under
        ``<self.show_dir>/assets/transcripts``.  For every entry in
        ``transcriptions`` the captions from the one whose formatted start
        equals the entry's start timestamp through the one matching its end
        timestamp (inclusive) are re-timed and collected.

        transcriptions:  list of start/end 'pointers' into the source; each
            item is a dict with ``'start'`` (keys ``'timestamp'``,
            ``'video_time'``, ``'text'``) and ``'end'`` (keys
            ``'timestamp'``, ``'text'``)
        sub_pathname: full path to output file
        """

        transcript_filename = '12022017 NBPY SCC.scc'
        transcript_pathname = os.path.join(self.show_dir, "assets",
                                           "transcripts", transcript_filename)

        # Context managers so the handles are closed deterministically
        # instead of being left to the garbage collector.
        with open(transcript_pathname, encoding='iso-8859-1') as scc_file:
            caps = scc_file.read()

        transcript = pycaption.SCCReader().read(caps)
        language = transcript.get_languages()[0]  # ['en-US']
        captions = transcript.get_captions(language)

        out_captions = pycaption.CaptionList()

        for transcription in transcriptions:

            # state 0: looking for the start caption; state 1: inside the cut.
            state = 0
            for c in captions:

                if c.format_start() == \
                        transcription['start']['timestamp']:
                    state = 1
                    # Shift so this caption lands at 'video_time' in the
                    # output (presumably seconds -> microseconds; confirm
                    # against the caller).
                    offset = c.start - transcription['start'][
                        'video_time'] * 1000000
                    c.nodes[0].content = transcription['start']['text']

                if state == 1:

                    if c.format_start() == \
                            transcription['end']['timestamp']:
                        c.nodes[0].content = \
                            transcription['end']['text']
                        # The end caption itself is still emitted below.
                        state = 0

                    # NOTE: captions are re-timed in place, so the objects
                    # held by ``captions`` are modified as well.
                    c.start -= offset
                    c.end -= offset
                    out_captions.append(c)

        transcript.set_captions(language, out_captions)

        writer = pycaption.SRTWriter()

        with open(sub_pathname, 'wt') as sub_file:
            sub_file.write(writer.write(transcript))
Exemplo n.º 3
0
def write_captions(content, options):
    """Print ``content`` in every caption format enabled on ``options``.

    Args:
        content: Caption data handed unchanged to each writer's ``write``.
        options: Object exposing the truthy/falsy flags ``scc``, ``sami``,
            ``dfxp``, ``srt`` and ``transcript``.
    """
    def emit(writer):
        """Render ``content`` with ``writer`` and print the result."""
        text = writer.write(content)
        # Encode only when the result is not the native ``str`` type (i.e.
        # Python 2 ``unicode``); on Python 3 printing encoded bytes would
        # show the ``b'...'`` repr rather than the caption text.
        if not isinstance(text, str):
            text = text.encode("utf-8")
        # Single-argument call form: valid as a Python 2 print statement
        # and as the Python 3 print function.
        print(text)

    if options.scc:
        emit(pycaption.SCCWriter())
    if options.sami:
        emit(pycaption.SAMIWriter())
    if options.dfxp:
        emit(pycaption.DFXPWriter())
    if options.srt:
        emit(pycaption.SRTWriter())
    if options.transcript:
        emit(pycaption.TranscriptWriter())
Exemplo n.º 4
0
def main(argv):
    """Convert a caption file between formats and print the result.

    Recognised options:
        -h, --help      print usage and exit
        -i, --ifile     path of the input caption file
        -f, --sfile     input format  (srt / scc / webvtt / dfxp)
        -t, --tfile     output format (srt / scc / webvtt / dfxp)

    Args:
        argv: Command-line arguments without the program name.

    Exits with status 2 on a malformed command line and status 1 when the
    formats are identical or unsupported.
    """
    usage = 'test.py -i <inputfile> -f <intputType> -t <outputType>'
    # Format name -> pycaption class name; resolved lazily with getattr so
    # only the class actually needed is looked up.
    readers = {
        'scc': 'SCCReader',
        'srt': 'SRTReader',
        'dfxp': 'DFXPReader',
        'webvtt': 'WebVTTReader',
    }
    writers = {
        'scc': 'SCCWriter',
        'srt': 'SRTWriter',
        'dfxp': 'DFXPWriter',
        'webvtt': 'WebVTTWriter',
    }

    inputfile = ''
    inputType = ''
    outputType = ''

    try:
        # "h" takes no argument (the previous "h:" made a bare -h a parse
        # error), and the long options tested below must be declared here
        # or getopt rejects them.
        opts, _ = getopt.getopt(
            argv, "hi:f:t:", ["help", "ifile=", "sfile=", "tfile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-f", "--sfile"):
            inputType = arg
        elif opt in ("-t", "--tfile"):
            outputType = arg

    if inputType == outputType:
        print('Error: input type and output type are same format')
        sys.exit(1)

    with io.open(inputfile) as f:
        str1 = f.read()

    reader_name = readers.get(inputType.lower())
    if reader_name is None:
        print('Error: invalid input type. <srt/scc/webvtt/dfxp> allowed')
        sys.exit(1)
    c = getattr(pycaption, reader_name)().read(str1)

    writer_name = writers.get(outputType.lower())
    if writer_name is None:
        print('Error: invalid output type. <srt/scc/webvtt/dfxp> allowed')
        sys.exit(1)
    print(getattr(pycaption, writer_name)().write(c))
Exemplo n.º 5
0
def download_subs(link, output_folder, name):
    """Download DFXP captions from ``link`` and save them as an SRT file.

    Args:
        link: URL the captions are fetched from with an HTTP GET.
        output_folder: Directory the ``.srt`` file is written into.
        name: Output file name without the ``.srt`` extension.
    """
    # The request is sent with a desktop Firefox user-agent string.
    browser_headers = {
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:70.0) Gecko/20100101 Firefox/70.0'
    }
    response = requests.get(link, headers=browser_headers)

    captions = pycaption.DFXPReader().read(response.text)
    srt_text = pycaption.SRTWriter().write(captions)

    target = os.path.join(output_folder, name + '.srt')
    with io.open(target, 'w', encoding='utf-8') as srt_file:
        srt_file.write(srt_text)
def write_captions(content, options):
    """Print ``content`` once for each caption format enabled on ``options``.

    Args:
        content: Caption data handed unchanged to each writer's ``write``.
        options: Object exposing the truthy/falsy flags ``scc``, ``sami``,
            ``dfxp``, ``srt``, ``vtt`` and ``transcript``.
    """
    # (option flag, pycaption writer class name), in output order.
    builtin_writers = (
        ('scc', 'SCCWriter'),
        ('sami', 'SAMIWriter'),
        ('dfxp', 'DFXPWriter'),
        ('srt', 'SRTWriter'),
        ('vtt', 'WebVTTWriter'),
    )
    for flag, writer_name in builtin_writers:
        if getattr(options, flag):
            writer = getattr(pycaption, writer_name)()
            print(writer.write(content))

    if options.transcript:
        # Imported on demand: the transcript writer needs extra dependencies.
        from pycaption.transcript import TranscriptWriter
        rendered = TranscriptWriter().write(content)
        print(rendered)
Exemplo n.º 7
0
def write_captions(content, options, lang='', filename=''):
    """Emit ``content`` in every caption format enabled on ``options``.

    SAMI, DFXP, SRT and transcript output is printed.  WebVTT output is
    saved to ``<cwd>/captions/<filename>_<lang>.vtt`` instead; the
    ``captions`` directory must already exist.

    Args:
        content: Caption data handed unchanged to each writer's ``write``.
        options: Object exposing the truthy/falsy flags ``sami``, ``dfxp``,
            ``webvtt``, ``srt`` and ``transcript``.
        lang: Language passed to the WebVTT writer and used in the file name.
        filename: Base name of the ``.vtt`` file.
    """
    def emit(writer):
        """Render ``content`` with ``writer`` and print the result."""
        text = writer.write(content)
        # Encode only when the result is not the native ``str`` type (i.e.
        # Python 2 ``unicode``); on Python 3 printing encoded bytes would
        # show the ``b'...'`` repr rather than the caption text.
        if not isinstance(text, str):
            text = text.encode("utf-8")
        # Single-argument call form works on both Python 2 and Python 3.
        print(text)

    if options.sami:
        emit(pycaption.SAMIWriter())
    if options.dfxp:
        emit(pycaption.DFXPWriter())
    if options.webvtt:
        # Save vtt files into the captions folder.
        vtt_path = os.path.join(os.getcwd(), 'captions',
                                filename + '_' + lang + '.vtt')
        vtt_bytes = pycaption.WebVTTWriter().write(content,
                                                   lang).encode("utf-8")
        # Binary mode because the payload is already UTF-8 encoded bytes;
        # the context manager closes the handle, which used to be leaked.
        with open(vtt_path, 'wb') as vtt_file:
            vtt_file.write(vtt_bytes)
    if options.srt:
        emit(pycaption.SRTWriter())
    if options.transcript:
        emit(pycaption.TranscriptWriter())
Exemplo n.º 8
0
def convert_subs(subtitle_path: Path):
    """Convert a subtitle file to SRT with pycaption, then process the result.

    The file is read as UTF-8 and its format is auto-detected.  The SRT
    version is written next to the original (same name, ``.srt`` suffix) and
    passed to ``srt_to_timestamps``.

    Args:
        subtitle_path -- path of subtitles to convert
    Return True if successful, False when the format is not recognised.
    """
    with open(subtitle_path, encoding='utf-8') as source:
        raw_text = source.read()

    reader_cls = pycaption.detect_format(raw_text)
    if not reader_cls:
        # No pycaption reader recognises this content.
        return False

    reader = reader_cls()
    writer = pycaption.SRTWriter()
    srt_text = writer.write(reader.read(raw_text))

    srt_path = subtitle_path.with_suffix('.srt')
    with open(srt_path, 'w', encoding='utf-8') as target:
        target.write(srt_text)

    srt_to_timestamps(srt_path)
    return True
    def run(self):
        """Convert every file in ``self.input_files`` to ``self.output_type``.

        Each result is written to ``self.output_folder`` under the input's
        base name with the output type as extension.  Existing outputs are
        skipped unless ``self.overwrite_on`` is set.  Progress messages are
        sent through ``self.log_signal``.

        The input is read and converted completely *before* the output file
        is opened.  Converting a file onto itself therefore no longer
        truncates the input first, and a failed conversion does not leave an
        empty output file behind.
        """
        for input_file in self.input_files:
            input_file_name = os.path.basename(input_file)
            # Extension without the leading dot, lower-cased ("smi", "srt").
            input_type = os.path.splitext(input_file)[1].lower()[1:]
            output_file = os.path.join(
                self.output_folder,
                os.path.splitext(input_file_name)[0] + '.' + self.output_type)
            output_file_name = os.path.basename(output_file)
            if os.path.exists(output_file) and not self.overwrite_on:
                self.log_signal.emit("{}을 건너뜁니다...".format(input_file_name))
                continue
            self.log_signal.emit("{}을 읽습니다...".format(input_file_name))

            # Detect the encoding from the raw bytes, then read as text.
            with open(input_file, 'rb') as file_in:
                encoding = chardet.detect(file_in.read())['encoding']

            with open(input_file, 'r', encoding=encoding) as file_in:
                content = file_in.read()

            # Stays None for unsupported input types, as before; the chosen
            # writer then receives None.
            caption_set = None
            if input_type == "smi":
                caption_set = pycaption.SAMIReader().read(content)
            elif input_type == "srt":
                caption_set = pycaption.SRTReader().read(content)

            if self.output_type == "ats":
                # The ATS payload is written in binary mode.
                payload = AtsWriter().write(caption_set)
                with open(output_file, 'wb') as file_out:
                    file_out.write(payload)
            else:
                if self.output_type == "smi":
                    text = pycaption.SAMIWriter().write(caption_set)
                elif self.output_type == "srt":
                    text = pycaption.SRTWriter().write(caption_set)
                elif self.output_type == "txt":
                    text = TextWriter().write(caption_set)
                else:
                    # Unknown output type: an empty file is still created.
                    text = ''
                with open(output_file, 'w',
                          encoding=self.output_encoding) as file_out:
                    file_out.write(text)

            self.log_signal.emit("{}으로 변환했습니다".format(output_file_name))
Exemplo n.º 10
0
    def download_from_ism(self, url, output_name, output_format):
        """Download every text stream listed in a Smooth Streaming manifest.

        Fetches ``<url>/manifest``, and for each ``StreamIndex`` of type
        ``text`` downloads all segments, converts each one with
        ``self.ismt_to_ttml``, merges them into a single TTML document and
        writes it to ``self.output_dir``.

        Args:
            url: Base URL of the stream; the manifest and segment paths are
                appended to it.
            output_name: Base name for the output file (spaces become dots).
            output_format: ``'ttml'`` writes the merged document as-is,
                ``'srt'`` converts it with pycaption first.  Any other value
                leaves the output file empty.

        Exits the process with status 1 when a text stream is not TTML, and
        returns early (after logging an error) when a subtitle ends before
        it starts.
        """
        r = self.session.get(f'{url}/manifest')
        manifest = xmltodict.parse(r.content, force_list={'StreamIndex', 'c'})
        self.logger.debug(json.dumps(manifest, indent=4))

        for (index, stream) in enumerate(
                manifest['SmoothStreamingMedia']['StreamIndex']):
            if stream['@Type'] != 'text':
                continue

            lang = stream['@Language'].lower()

            fmt = stream['QualityLevel']['@FourCC'].upper()
            if fmt != 'TTML':
                self.logger.error(
                    f'Stream has unsupported subtitle format: {fmt!r}')
                sys.exit(1)

            # NOTE(review): the track number is the manifest position minus
            # two - presumably to skip a video and an audio StreamIndex;
            # confirm against real manifests.
            index -= 2
            output = f'{output_name.replace(" ", ".")}.{lang}.{index}.srt'
            output = pathvalidate.sanitize_filename(output)
            output = os.path.join(self.output_dir, output)
            self.logger.info(f'Saving subtitle track #{index} to {output}')

            path = stream['@Url'].replace('{bitrate}',
                                          stream['QualityLevel']['@Bitrate'])

            # Build the list of segment start times from the <c> elements.
            t = 0
            ts = []

            for c in stream['c']:
                if c.get('@t'):
                    t = int(c['@t'])
                    ts.append(t)

                if not c.get('@d'):
                    # Stream only has a single segment
                    break

                # xmltodict returns attribute values as strings, so the
                # repeat count must be converted before use in range().
                for _ in range(int(c.get('@r', 1))):
                    t += int(c['@d'])
                    ts.append(t)

            ts = ts[:-1]  # Remove nonexistent last segment

            xml = None

            for (i, t) in enumerate(ts):
                self.logger.debug(f'Downloading segment {i + 1} of {len(ts)}')
                seg_url = f'{url}/{path.replace("{start time}", str(t))}'
                seg = self.session.get(seg_url).content

                if not seg:
                    # Empty segment
                    continue

                data = self.ismt_to_ttml(seg).decode('utf-8')

                # Swap <br/> for a placeholder while the XML goes through
                # xmltodict; it is restored after unparsing below.
                assert '{{BR}}' not in data, 'input data contains br placeholder'
                data = re.sub(r'<br ?/>', '{{BR}}', data)

                xml_seg = xmltodict.parse(
                    data,
                    force_list={'p'},
                    process_namespaces=True,
                    namespaces={
                        'http://www.w3.org/XML/1998/namespace': None,
                        'http://www.w3.org/2006/10/ttaf1': None,
                        'http://www.w3.org/2006/10/ttaf1#metadata': None,
                        'http://www.w3.org/2006/10/ttaf1#styling': None,
                    },
                )

                if i == 0:
                    # The first segment becomes the document that all later
                    # segments are merged into.
                    # NOTE(review): if segment 0 is empty it is skipped above
                    # and `xml`/`fps` are never initialised for the segments
                    # that follow - confirm whether that can happen.
                    xml = xml_seg

                    # NOTE(review): with process_namespaces=True and no
                    # mapping for the ttp namespace these keys may never
                    # match, in which case the 30 fps default always applies
                    # - confirm against xmltodict's namespace handling.
                    fps_base = xml['tt'].get('@ttp:frameRate')
                    fps_mult = xml['tt'].get('@ttp:frameRateMultiplier')

                    if xml['tt']['body']['div'] is None:
                        xml['tt']['body']['div'] = {'p': []}

                    if fps_base:
                        if fps_mult:
                            # "numerator denominator", e.g. "1000 1001".
                            mult = [int(x) for x in fps_mult.split(' ')]
                            mult = truediv(*mult)
                        else:
                            mult = 1

                        # Effective rate is frameRate * multiplier; the
                        # attribute values are strings, so convert first.
                        fps = int(fps_base) * mult
                    else:
                        fps = 30  # Per TTML spec

                else:
                    div = xml_seg['tt']['body']['div']

                    if div is None:
                        # Empty subtitle file
                        continue

                    subs = div['p']

                    # Segment start expressed in seconds.
                    scale = int(stream['@TimeScale'])
                    offset = t / scale

                    for p in subs:
                        for a in ('@begin', '@end'):
                            # Rewrite "h:m:s:f" timecodes as absolute
                            # seconds shifted by the segment start.
                            tc = p[a]
                            (h, m, s, f) = [int(x) for x in tc.split(':')]
                            total = round(
                                h * 3600 + m * 60 + s + f / fps + offset, 3)
                            p[a] = f'{total}s'

                        begin = float(p['@begin'][:-1])
                        end = float(p['@end'][:-1])

                        if end < begin:
                            self.logger.error(
                                f'End time is earlier than start time ({end} < {begin})',
                            )
                            return

                    xml['tt']['body']['div']['p'].extend(subs)

            xml_data = xmltodict.unparse(xml)
            xml_data = xml_data.replace('{{BR}}', '<br />')

            os.makedirs(self.output_dir, exist_ok=True)

            with open(output, 'wb') as fd:
                if output_format == 'ttml':
                    fd.write(xml_data.encode('utf-8-sig'))
                elif output_format == 'srt':
                    self.logger.debug('Converting to SRT')
                    captions = pycaption.DFXPReader().read(xml_data)
                    srt_text = pycaption.SRTWriter().write(captions)
                    fd.write(srt_text.encode('utf-8-sig'))