Exemplo n.º 1
0
 def _test_srt_to_scc_to_srt_conversion(self, srt_captions):
     """Round-trip SRT -> SCC -> SRT and compare against the first parse.

     The SRT input is parsed, written as SCC, read back, written as SRT
     and parsed again. The first and last caption sets must be almost
     equal within TOLERANCE_MICROSECONDS.
     """
     original_set = SRTReader().read(srt_captions)

     # Detour through SCC.
     scc_text = SCCWriter().write(original_set)
     set_from_scc = SCCReader().read(scc_text)

     # Back to SRT and parse once more.
     srt_text = SRTWriter().write(set_from_scc)
     round_tripped_set = SRTReader().read(srt_text)

     self.assertCaptionSetAlmostEquals(
         original_set, round_tripped_set, TOLERANCE_MICROSECONDS)
Exemplo n.º 2
0
    def test_multiple_lines_for_one_sentence(self, samples_srt_same_time):
        """The SRT written back from the fixture contains exactly 3 cues.

        Cues are counted by splitting the written text on the
        "HH:MM:SS,mmm -->" start-timestamp marker.
        """
        parsed = SRTReader().read(samples_srt_same_time)
        srt_text = SRTWriter().write(parsed)

        # The first piece is whatever precedes the first timestamp; discard it.
        _, *cue_bodies = re.split(r"\d{2}:\d{2}:\d{2},\d{3} -->", srt_text)

        assert len(cue_bodies) == 3
Exemplo n.º 3
0
    def test_srt_to_webvtt_conversion(self, sample_webvtt_from_srt,
                                      sample_srt):
        """SRT read by SRTReader and written by WebVTTWriter matches the fixture."""
        parsed = SRTReader().read(sample_srt)
        webvtt_text = WebVTTWriter().write(parsed)

        # The writer must hand back text, and that text must match the sample.
        assert isinstance(webvtt_text, str)
        self.assert_webvtt_equals(sample_webvtt_from_srt, webvtt_text)
Exemplo n.º 4
0
 def test_srt_to_dfxp_conversion(self):
     """Converting SAMPLE_SRT to DFXP yields text matching SAMPLE_DFXP.

     Styling and spans are ignored in the comparison.
     """
     caption_set = SRTReader().read(SAMPLE_SRT)
     results = DFXPWriter().write(caption_set)
     # assertIsInstance names the offending type on failure, whereas
     # assertTrue(isinstance(...)) only reports "False is not true".
     self.assertIsInstance(results, six.text_type)
     self.assertDFXPEquals(SAMPLE_DFXP,
                           results,
                           ignore_styling=True,
                           ignore_spans=True)
Exemplo n.º 5
0
    def test_srt_to_dfxp_conversion(self, sample_dfxp, sample_srt):
        """SRT read by SRTReader and written by DFXPWriter matches the fixture.

        Styling and spans are ignored in the comparison.
        """
        parsed = SRTReader().read(sample_srt)
        dfxp_text = DFXPWriter().write(parsed)

        assert isinstance(dfxp_text, str)
        self.assert_dfxp_equals(
            sample_dfxp, dfxp_text, ignore_styling=True, ignore_spans=True)
Exemplo n.º 6
0
def _srt_gen_from_url(base_url, end_time=3660, verbose=True):
    """Yield SRT text fetched from ``base_url`` one time window at a time.

    Each request sends a ``t=<start>/<end>`` query parameter. The first
    window is ``0/60``; each following window starts one past the previous
    end and ends 60 later (``61/120``, ``121/180``, ...). Fetching stops
    once the next window's end would exceed ``end_time``.

    Every non-empty response is parsed as SRT, the start and end of each
    'en-US' caption are shifted by the end of the last caption of the
    previous chunk, and the chunk is written back as SRT with a space
    inserted before every blank line. An empty response yields ''.

    Arguments:
        base_url: URL the caption windows are requested from.
        end_time: upper bound for a window's end value (same unit as the
            ``t`` parameter; presumably seconds -- confirm against the
            service).
        verbose: when true, print the URL of every window before fetching.

    Raises:
        requests.HTTPError: when a response has an error status.
    """
    window = 60
    t0 = 0
    t1 = t0 + window

    is_first = True
    # End of the last caption yielded so far. It starts at 0.0, so the
    # first chunk is shifted by zero.
    last_end = 0.0

    has_next = True
    while has_next:
        time_range = '{}/{}'.format(t0, t1)

        if verbose:
            print('fetching captions from ' + base_url + '?t=' + time_range)

        res = requests.get(base_url, params={'t': time_range})
        res.raise_for_status()

        srt = res.text
        if is_first:
            # Only the first response gets its byte-order marks removed.
            srt = srt.replace(u'\ufeff', '')
            is_first = False

        t0 = t1 + 1
        t1 = t1 + window
        has_next = t1 <= end_time

        if not srt:
            yield ''
            continue

        cc = CaptionConverter()
        cc.read(srt, SRTReader())
        captions = cc.captions.get_captions(lang='en-US')

        # Shift this chunk so it continues where the previous one ended.
        for caption in captions:
            caption.start += last_end
            caption.end += last_end
        last_end = captions[-1].end

        yield cc.write(SRTWriter()).replace('\n\n', ' \n\n')
Exemplo n.º 7
0
def route_subtitles(course_id, lecture_id):
    """Fetch a lecture's English subtitles and return them as WebVTT.

    The subtitles are requested from class.coursera.org, parsed as SRT and
    re-written as WebVTT. When the reader raises CaptionReadNoCaptions the
    response body is empty. The response always has content type
    ``text/vtt``.
    """
    url_template = (
        'https://class.coursera.org/%s-001/lecture/subtitles?q=%d_en')
    upstream = requests.get(url_template % (course_id, lecture_id))

    try:
        srt_to_vtt = CaptionConverter()
        srt_to_vtt.read(upstream.text, SRTReader())
        body = srt_to_vtt.write(WebVTTWriter())
    except CaptionReadNoCaptions:
        # Nothing to show: serve an empty document instead of failing.
        body = ''

    return Response(body, content_type='text/vtt')
Exemplo n.º 8
0
    def srt2ttml(srt_file_path, ttml_file_path=None):
        """Convert SubRip subtitles to TTML subtitles.

        The SubRip file is read as UTF-8 and the TTML output is written as
        UTF-8 encoded bytes.

        Arguments:
            srt_file_path {string} -- The path to the SubRip file.
            ttml_file_path {string} -- The path to the TTML file. When
                omitted, the input path with its final extension replaced
                by ".xml" is used.
        """
        import os

        converter = CaptionConverter()
        with open(srt_file_path, "r", encoding="utf8") as file:
            converter.read(file.read(), SRTReader())
        if ttml_file_path is None:
            # Swap only the trailing extension. Replacing every ".srt"
            # substring would also rewrite directory names, and would leave
            # a path without that exact suffix (e.g. "a.SRT") unchanged, so
            # the output would overwrite the input.
            ttml_file_path = os.path.splitext(srt_file_path)[0] + ".xml"
        with open(ttml_file_path, "wb") as file:
            file.write(converter.write(DFXPWriter()).encode("utf-8"))
Exemplo n.º 9
0
def convert_srt_to_dfxp(times, generate_output=False):
    """Time SRT -> DFXP conversion for every file in INPUT_DIRECTORY.

    Each non-hidden file is read as UTF-8 and converted ``times`` times;
    the elapsed time of every conversion is accumulated. Progress is
    written to stdout after each file and a summary is printed at the end.

    Arguments:
        times: number of conversions to run per file.
        generate_output: when true, write each file's DFXP result to
            OUTPUT_DIRECTORY as "<name>.xml" unless that file already
            exists.

    Raises:
        UnicodeDecodeError: when an input file is not valid UTF-8 (the
            offending path is printed first).
    """
    all_time = 0
    counter = 0
    input_files = os.listdir(INPUT_DIRECTORY)
    input_files_count = len(input_files)
    skipped_files = 0

    for input_file in input_files:

        if input_file.startswith('.'):  # Skip hidden (dot) files
            skipped_files += 1
            continue

        filename = input_file.split('.')[0]
        file_path = '{}/{}'.format(INPUT_DIRECTORY, input_file)
        output_path = "{}/{}.xml".format(OUTPUT_DIRECTORY, filename)

        with open(file_path, 'r', encoding='utf-8') as fh:
            try:
                srt_data = fh.read()
            except UnicodeDecodeError:
                print("Problem with {}".format(file_path))
                raise

        total_file_time = 0

        for _ in range(times):
            t0 = timer()
            dfxp_data = DFXPWriter().write(SRTReader().read(srt_data))
            t1 = timer()
            total_file_time += t1 - t0

            # A direct existence check avoids listing the whole output
            # directory on every iteration.
            if generate_output and not os.path.exists(output_path):
                # Write as UTF-8 explicitly so the result does not depend
                # on the locale's default encoding.
                with open(output_path, "w",
                          encoding="utf-8") as new_dfxp_file:
                    new_dfxp_file.write(dfxp_data)

        all_time += total_file_time
        counter += 1
        sys.stdout.write("\r{}/{} files completed.".format(
            counter, input_files_count))

    # With times == 0 nothing was converted; report 0 instead of dividing
    # by zero.
    average_time = all_time / times if times else 0
    print(
        "\nConverting {} files took an average of {} seconds over {} iteration{}.\n{} files were skipped."
        .format(counter, average_time, times, "s" if times > 1 else "",
                skipped_files))
Exemplo n.º 10
0
def _make_ts_from_srt(srt):
    """Turn SRT text into a list of transcript pieces split on '>>'.

    The text is padded with spaces (at the end-of-string positions matched
    by '$' and before every blank line), NFC-normalized, and stripped of
    every character whose Unicode category starts with 'C' except newline.
    It is then read as SRT and written with TranscriptWriter; '>>> ' is
    collapsed to '>>' and newlines become spaces before the final split.
    """
    converter = CaptionConverter()

    padded = re.sub('$', ' ', srt)
    padded = padded.replace('\n\n', ' \n\n')

    normalized = unicodedata.normalize('NFC', padded)

    # Keep newlines; drop all other category-"C" characters.
    kept_chars = [
        ch for ch in normalized
        if ch == '\n' or not unicodedata.category(ch).startswith('C')
    ]
    converter.read(''.join(kept_chars), SRTReader())

    transcript = converter.write(TranscriptWriter())
    transcript = transcript.replace(u'>>> ', u'>>')
    transcript = transcript.replace('\n', ' ')

    return transcript.split('>>')
Exemplo n.º 11
0
    def srt2ttml(srt_file_path: str, ttml_file_path: Optional[str] = None) -> None:
        """Convert SubRip subtitles to TTML subtitles.

        The input is read with the encoding reported by
        Utils.detect_encoding and the output is encoded with that same
        encoding.

        Arguments:
            srt_file_path {string} -- The path to the SubRip file.
            ttml_file_path {string} -- The path to the TTML file. When
                omitted, the input path with its final extension replaced
                by ".xml" is used.
        """
        import os

        file: Union[TextIO, BinaryIO]
        converter = CaptionConverter()
        encoding = Utils.detect_encoding(srt_file_path)
        with open(srt_file_path, "r", encoding=encoding) as file:
            converter.read(file.read(), SRTReader())
        if ttml_file_path is None:
            # Swap only the trailing extension. Replacing every ".srt"
            # substring would also rewrite directory names, and would leave
            # a path without that exact suffix (e.g. "a.SRT") unchanged, so
            # the output would overwrite the input.
            ttml_file_path = os.path.splitext(srt_file_path)[0] + ".xml"
        with open(ttml_file_path, "wb") as file:
            file.write(converter.write(DFXPWriter()).encode(encoding))
Exemplo n.º 12
0
    def srt2sami(srt_file_path: str, sami_file_path: Optional[str] = None) -> None:
        """Convert SubRip subtitles to SAMI subtitles.

        The input is read with the encoding reported by
        Utils.detect_encoding and the output is encoded with that same
        encoding.

        Arguments:
            srt_file_path {string} -- The path to the SubRip file.
            sami_file_path {string} -- The path to the SAMI file. When
                omitted, the input path with its final extension replaced
                by ".smi" is used.
        """
        import os

        file: Union[TextIO, BinaryIO]
        converter = CaptionConverter()
        encoding = Utils.detect_encoding(srt_file_path)
        with open(srt_file_path, "r", encoding=encoding) as file:
            converter.read(file.read(), SRTReader())
        if sami_file_path is None:
            # Swap only the trailing extension. Replacing every ".srt"
            # substring would also rewrite directory names, and would leave
            # a path without that exact suffix (e.g. "a.SRT") unchanged, so
            # the output would overwrite the input.
            sami_file_path = os.path.splitext(srt_file_path)[0] + ".smi"
        with open(sami_file_path, "wb") as file:
            file.write(converter.write(SAMIWriter()).encode(encoding))
Exemplo n.º 13
0
def subtitle(request, title, no):
    """Serve one English subscene subtitle for ``title`` as WebVTT.

    Parenthesised groups and the final character are removed from
    ``title`` before searching subscene. The subtitle at index ``int(no)``
    is downloaded as a zip, its first member is decoded with the encoding
    BeautifulSoup reports, parsed as SRT and written as WebVTT. The
    response body is the ASCII-only, HTML-unescaped WebVTT text.

    NOTE(review): this block uses Python 2 names (``unicode``,
    ``StringIO`` over response bytes, ``HTMLParser.HTMLParser``).
    """
    # Raw string: "\(" is not a valid escape in an ordinary string literal.
    t = re.sub(r'\(.*?\)', '', title)[:-1]
    film = subscene.search(t, "English")

    # Named so that it does not shadow the builtin ``zip``.
    zip_response = requests.get(
        subscene.zipped_url(film.subtitles[int(no)]))

    fp = StringIO(zip_response.content)
    archive = zipfile.ZipFile(fp, 'r')
    srt = archive.read(archive.namelist()[0])
    # BeautifulSoup is used here only for its encoding detection.
    soup = BeautifulSoup(srt)
    converter = CaptionConverter()
    unistring = unicode(srt.decode(soup.originalEncoding))
    # Drop a leading byte-order mark only when one is really there;
    # slicing unconditionally removes the first real character of UTF-8
    # files that have no BOM.
    if "utf-8" in soup.originalEncoding and unistring.startswith(u'\ufeff'):
        unistring = unistring[1:]
    converter.read(unistring, SRTReader())
    html_parser = HTMLParser.HTMLParser()

    return HttpResponse(html_parser.unescape(converter.write(WebVTTWriter()).encode('ascii', 'ignore')),
                        content_type="text/vtt")
Exemplo n.º 14
0
def convert_subtitles_to_vtt(input_file: str, output_file: str):
    """Convert .srt subtitles to .vtt for web playback.

    The input is opened twice: once as bytes so chardet can guess the
    encoding, then as text with that encoding. The WebVTT result is
    written as 'utf-8-sig'.

    Returns True on success, or False (after logging the exception) when
    the SRT reader raises CaptionReadNoCaptions.
    """
    logger.info(f'Converting {input_file} to {output_file}')

    # Pass 1: raw bytes, used only for encoding detection.
    with open(input_file, mode='rb') as raw_file:
        detection = chardet.detect(raw_file.read())
    source_encoding = detection['encoding']

    # Pass 2: decoded text.
    with open(input_file, mode='r', encoding=source_encoding) as srt_file:
        srt_text = str(srt_file.read())

    caption_converter = CaptionConverter()
    try:
        caption_converter.read(srt_text, SRTReader())
    except CaptionReadNoCaptions:
        logger.exception(f'Failed to convert {input_file} to {output_file}')
        return False  # Likely UTF-16 subtitles

    # Convert before opening the output so a failure leaves no empty file.
    vtt_text = caption_converter.write(WebVTTWriter())
    with open(output_file, mode='w', encoding='utf-8-sig') as vtt_file:
        vtt_file.write(vtt_text)

    return True
Exemplo n.º 15
0
def from_srt(input_f, output_f):
  """
    Takes an input SRT file or filename and writes out VTT contents to the given
    output file or filename.

    The encoding is guessed with chardet; when the confidence is below 0.9
    'cp1252' is used instead. If decoding fails, 'cp1252' and then 'utf8'
    are tried in turn, and the last UnicodeDecodeError is re-raised once
    both fallbacks are exhausted. Every encoding attempted is printed.

    The output is the UTF-8 encoded WebVTT text minus its final byte.
  """
  default_subrip_encoding = 'cp1252' # standard for SubRip files

  with vtt_open(input_f, 'r') as f:
    orig = f.read()

  detect = chardet.detect(orig)
  encoding = detect['encoding']
  if detect['confidence'] < 0.9:
    encoding = default_subrip_encoding

  # caption converter seems to have a tough time with the BOM on
  # Python < 2.7.8, so ditch it if it exists. The BOM is removed from the
  # raw bytes: comparing decoded text against codecs.BOM_UTF8 (a byte
  # string) never matches.
  # NOTE(review): assumes vtt_open(..., 'r') yields bytes, as the
  # chardet.detect / .decode calls imply -- confirm.
  if orig.startswith(codecs.BOM_UTF8):
    orig = orig[len(codecs.BOM_UTF8):]

  backups = [default_subrip_encoding, 'utf8']

  while True:
    try:
      # Parenthesised so the line is valid on Python 2 and Python 3.
      print("ENCODING: " + encoding)
      contents = orig.decode(encoding)
      break
    except UnicodeDecodeError:
      if not backups:
        raise
      encoding = backups.pop(0)

  converter = CaptionConverter()
  converter.read(contents, SRTReader())
  contents = converter.write(WebVTTWriter())

  with vtt_open(output_f, 'w') as o:
    o.write(contents.encode('utf-8')[:-1])
Exemplo n.º 16
0
def dfxpconv(filename=str, ckeep=bool):
    """Convert an SRT file to a DFXP file adjusted for Super Netflix.

    The SRT file is read as UTF-8 (undecodable bytes ignored), converted
    with DFXPWriter, post-processed with a fixed list of string
    replacements, and written as UTF-8 to the input path with its final
    extension replaced by ".dfxp".

    Arguments:
        filename: path of the SRT file to convert.
        ckeep: when falsy, the SRT file is removed after conversion.

    NOTE(review): the defaults are the type objects ``str`` and ``bool``,
    presumably intended as annotations; they are left untouched so the
    signature stays the same.
    """
    # Context managers close the files even when conversion fails.
    with open(filename, "r", encoding='utf-8', errors='ignore') as fsrt:
        srtcont = fsrt.read()

    # Super Netflix Compatibility
    # Converter that is used (DFXPWriter) uses a different set of rules
    # than what Super Netflix (and Netflix) wants.
    # Doing this will avoid the "M7034" error.
    # It will also remove any formatting as there is no such thing as formatting
    # in the Netflix Player.
    # NOTE(review): the first pattern contains "xmlns:=" -- verify that it
    # matches what DFXPWriter actually emits.
    replacements = (
        ("<tt xml:lang=\"en\" xmlns:=\"http://www.w3.org/ns/ttml\" "
         "xmlns:tts=\"http://www.w3.org/ns/ttml#styling\">",
         "<tt xml:lang='en' xmlns='http://www.w3.org/2006/10/ttaf1' "
         "xmlns:tts='http://www.w3.org/2006/10/ttaf1#style'>"),
        ("<div region=\"bottom\" xml:lang=\"en-US\">",
         "<div xml:id=\"captions\">"),
        ("&lt;font face=\"Open Sans Semibold\" size=\"36\"&gt;", ""),
        ("&lt;/font&gt;", ""),
        (" region=\"bottom\" style=\"default\"", ""),
        ("&lt;b&gt;", ""),
        ("&lt;/b&gt;", ""),
        ("&lt;i&gt;", ""),
        ("&lt;/i&gt;", ""),
        ("{\\an8}", ""),
    )
    dfxpedit = DFXPWriter().write(SRTReader().read(srtcont))
    for old, new in replacements:
        dfxpedit = dfxpedit.replace(old, new)

    # Swap only the trailing extension: replacing every ".srt" substring
    # would also rewrite directory names, and would make the output path
    # equal to the input path when the name lacks that exact suffix.
    dfxp_path = os.path.splitext(filename)[0] + ".dfxp"

    # The output is opened only after conversion succeeded, so a failure
    # does not leave an empty .dfxp file behind.
    with open(dfxp_path, "wb") as fdfxp:
        fdfxp.write(dfxpedit.encode('utf-8', errors='replace'))

    if not ckeep:
        os.remove(filename)
Exemplo n.º 17
0
    def test_caption_length(self):
        """Reading SAMPLE_SRT yields 7 captions for "en-US"."""
        captions = SRTReader().read(SAMPLE_SRT)

        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual(7, len(captions.get_captions(u"en-US")))
Exemplo n.º 18
0
    def test_extra_empty_line(self):
        """Reading SAMPLE_SRT_BLANK_LINES yields 2 captions for "en-US"."""
        captions = SRTReader().read(SAMPLE_SRT_BLANK_LINES)

        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual(2, len(captions.get_captions("en-US")))
Exemplo n.º 19
0
 def test_empty_file(self):
     """Reading SAMPLE_SRT_EMPTY must raise CaptionReadNoCaptions."""
     # Build the reader outside the context so that only the read call
     # itself is expected to raise.
     read_srt = SRTReader().read
     with self.assertRaises(CaptionReadNoCaptions):
         read_srt(SAMPLE_SRT_EMPTY)
Exemplo n.º 20
0
    def test_proper_timestamps(self):
        """The third "en-US" caption of SAMPLE_SRT has the expected start/end."""
        captions = SRTReader().read(SAMPLE_SRT)
        paragraph = captions.get_captions(u"en-US")[2]

        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual(17000000, paragraph.start)
        self.assertEqual(18752000, paragraph.end)
Exemplo n.º 21
0
 def test_detection(self):
     """SRTReader.detect returns a truthy value for the decoded SAMPLE_SRT."""
     decoded_sample = SAMPLE_SRT.decode(u'utf-8')
     detected = SRTReader().detect(decoded_sample)
     self.assertTrue(detected)
Exemplo n.º 22
0
 def test_extra_trailing_empty_line(self):
     """Reading SAMPLE_SRT_TRAILING_BLANKS yields 2 captions for "en-US"."""
     captions = SRTReader().read(SAMPLE_SRT_TRAILING_BLANKS)
     # assertEquals is a deprecated alias that was removed in Python 3.12.
     self.assertEqual(2, len(captions.get_captions(u"en-US")))
Exemplo n.º 23
0
def build_srt_reader():
    """Return a SubtitleReader wrapping a new SRTReader.

    The wrapper is created with ``requires_language=True``.
    """
    srt_backend = SRTReader()
    return SubtitleReader(srt_backend, requires_language=True)
Exemplo n.º 24
0
 def test_extra_trailing_empty_line(self):
     """Reading SAMPLE_SRT_TRAILING_BLANKS yields 2 captions for "en-US"."""
     parsed = SRTReader().read(SAMPLE_SRT_TRAILING_BLANKS)
     english_captions = parsed.get_captions(u"en-US")
     self.assertEqual(2, len(english_captions))
Exemplo n.º 25
0
    def test_srt_to_microdvd_conversion(self, sample_microdvd, sample_srt):
        """SRT read by SRTReader and written by MicroDVDWriter matches the fixture."""
        parsed = SRTReader().read(sample_srt)
        microdvd_text = MicroDVDWriter().write(parsed)

        # The writer must hand back text, and that text must match the sample.
        assert isinstance(microdvd_text, str)
        self.assert_microdvd_equals(sample_microdvd, microdvd_text)
Exemplo n.º 26
0
    def test_proper_pcc_format(self):
        """SAMPLE_SRT parses into "captions" and "styles" with 7 en-US captions."""
        captions = SRTReader().read(SAMPLE_SRT)

        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual({"captions", "styles"}, set(captions.keys()))
        self.assertEqual(7, len(captions["captions"]["en-US"]))
Exemplo n.º 27
0
    def test_proper_timestamps(self):
        """The third "en-US" caption of SAMPLE_SRT has the expected start/end."""
        captions = SRTReader().read(SAMPLE_SRT)
        paragraph = captions.get_captions(u"en-US")[2]

        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual(17000000, paragraph.start)
        self.assertEqual(18752000, paragraph.end)
Exemplo n.º 28
0
 def test_numeric_captions(self):
     """Reading SAMPLE_SRT_NUMERIC yields 7 captions for "en-US"."""
     captions = SRTReader().read(SAMPLE_SRT_NUMERIC)
     # assertEquals is a deprecated alias that was removed in Python 3.12.
     self.assertEqual(7, len(captions.get_captions(u"en-US")))
Exemplo n.º 29
0
 def test_srt_to_sami_conversion(self):
     """Converting SAMPLE_SRT to SAMI yields text matching SAMPLE_SAMI."""
     caption_set = SRTReader().read(SAMPLE_SRT)
     results = SAMIWriter().write(caption_set)
     # assertIsInstance names the offending type on failure, whereas
     # assertTrue(isinstance(...)) only reports "False is not true".
     self.assertIsInstance(results, six.text_type)
     self.assertSAMIEquals(SAMPLE_SAMI, results)
Exemplo n.º 30
0
    def test_caption_length(self):
        """The UTF-8 decoded SAMPLE_SRT yields 8 captions for "en-US"."""
        captions = SRTReader().read(SAMPLE_SRT.decode(u'utf-8'))

        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual(8, len(captions.get_captions(u"en-US")))
Exemplo n.º 31
0
 def test_srt_to_webvtt_conversion(self):
     """Converting SAMPLE_SRT to WebVTT matches SAMPLE_WEBVTT_FROM_SRT."""
     caption_set = SRTReader().read(SAMPLE_SRT)
     results = WebVTTWriter().write(caption_set)
     # assertIsInstance names the offending type on failure, whereas
     # assertTrue(isinstance(...)) only reports "False is not true".
     self.assertIsInstance(results, six.text_type)
     self.assertWebVTTEquals(SAMPLE_WEBVTT_FROM_SRT, results)
Exemplo n.º 32
0
    def test_caption_length(self):
        """Reading SAMPLE_SRT yields 7 captions for "en-US"."""
        captions = SRTReader().read(SAMPLE_SRT)

        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual(7, len(captions.get_captions(u"en-US")))
Exemplo n.º 33
0
 def setUpClass(cls):
     """Parse the three SRT samples once and store them on the class.

     SAMPLE_SRT and SAMPLE_SRT_UTF8 are decoded from UTF-8 before being
     read; SAMPLE_SRT_UNICODE is passed to the reader as is.
     """
     decoded_sample = SAMPLE_SRT.decode(u'utf-8')
     cls.captions = SRTReader().read(decoded_sample)

     decoded_utf8_sample = SAMPLE_SRT_UTF8.decode(u'utf-8')
     cls.captions_utf8 = SRTReader().read(decoded_utf8_sample)

     cls.captions_unicode = SRTReader().read(SAMPLE_SRT_UNICODE)
Exemplo n.º 34
0
 def test_numeric_captions(self):
     """Reading SAMPLE_SRT_NUMERIC yields 7 captions for "en-US"."""
     captions = SRTReader().read(SAMPLE_SRT_NUMERIC)
     # assertEquals is a deprecated alias that was removed in Python 3.12.
     self.assertEqual(7, len(captions.get_captions(u"en-US")))
Exemplo n.º 35
0
 def test_srt_reader_only_supports_unicode_input(self):
     """Reading the plain literal '' must raise InvalidInputError."""
     def read_plain_literal():
         # Reader construction and the read both happen inside the call
         # that assertRaises guards.
         SRTReader().read('')

     self.assertRaises(InvalidInputError, read_plain_literal)
Exemplo n.º 36
0
 def test_detection(self):
     """SRTReader.detect returns a truthy value for SAMPLE_SRT."""
     detected = SRTReader().detect(SAMPLE_SRT)
     self.assertTrue(detected)
Exemplo n.º 37
0
    def test_caption_length(self):
        """The UTF-8 decoded SAMPLE_SRT yields 8 captions for "en-US"."""
        captions = SRTReader().read(SAMPLE_SRT.decode(u'utf-8'))

        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual(8, len(captions.get_captions(u"en-US")))