def write_into(self, output_file, eol=None, include_indexes=False):
    """
    write_into(output_file [, eol [, include_indexes]])

    Serialize current state into `output_file`.

    `output_file` -> any object exposing `write()`, typically a file object.
    `eol` -> line ending to emit; `self.eol` is used when it is falsy.
    `include_indexes` -> when True, every cue is preceded by a line
    holding its index.
    """
    self._check_valid_len()
    line_ending = eol or self.eol

    # File signature followed by a blank line.
    output_file.write("WEBVTT{0}{0}".format(line_ending))

    for cue in self:
        rendered = str(cue)
        # Cues are rendered with '\n'; translate only when another
        # line ending was requested.
        if line_ending != '\n':
            rendered = rendered.replace('\n', line_ending)

        if include_indexes:
            output_file.write(str(cue.index) + line_ending)
        output_file.write(rendered)

        # The blank line separating cues belongs to the file, not to the
        # cue text. Some callers hand over text that already ends with
        # one, so it is only appended when missing.
        if rendered.endswith(2 * line_ending):
            continue
        output_file.write(line_ending)
def __init__(self, index=0, start=None, end=None, text='', position=''):
    """
    Build an item from its raw parts.

    index -> stored as int when it can be cast, otherwise stored as given.
    start, end -> passed through WebVTTTime.coerce; falsy values become 0.
    text, position -> stored as their str() conversion.
    """
    # An integer index is preferred but not mandatory.
    try:
        index = int(index)
    except (TypeError, ValueError):
        pass
    self.index = index

    self.start = WebVTTTime.coerce(start if start else 0)
    self.end = WebVTTTime.coerce(end if end else 0)
    self.position = str(position)
    self.text = str(text)
def test_eol_preservation(self):
    """Check that the end-of-line style of an input file is kept after saving."""
    self.temp_eol_path = os.path.join(self.static_path, 'temp_eol_preserv.vtt')
    end_of_lines = ['\n', '\r', '\r\n']
    enc = 'utf-8'

    for eols in end_of_lines:
        content = str('00:01:00,000 --> 00:02:00,000' + eols +
                      'TestEOLPreservation' + eols)
        try:
            # The file is opened in binary mode so that the end of line
            # under test reaches the disk untouched; binary mode needs
            # bytes, hence the explicit encode.
            with open(self.temp_eol_path, 'wb') as input_eol:
                input_eol.write(content.encode(enc))

            # Plain text mode already performs universal-newline decoding
            # and records what it saw in `.newlines`; the 'U' flag is
            # redundant and no longer accepted by recent Python versions.
            with open(self.temp_eol_path, 'r', encoding=enc) as input_file:
                input_file.read()
                input_newlines = input_file.newlines
            self.assertEqual(eols, input_newlines)

            vtt_file = pyvtt.open(self.temp_eol_path, encoding=enc)
            vtt_file.save(self.temp_eol_path, eol=input_newlines)

            with open(self.temp_eol_path, 'r', encoding=enc) as output_file:
                output_file.read()
                output_newlines = output_file.newlines
            self.assertEqual(output_newlines, input_newlines)
        finally:
            # Do not leave the scratch file behind when an assertion fails.
            if os.path.exists(self.temp_eol_path):
                os.remove(self.temp_eol_path)
class WebVTTItem(ComparableMixin):
    """
    WebVTTItem(index, start, end, text, position)

    index -> int when it can be cast, otherwise kept as given.
    start, end -> WebVTTTime or coercible.
    text -> unicode: text content for item.
    position -> unicode: raw vtt "display coordinates" string
    """
    ITEM_PATTERN = str('%s --> %s%s\n%s\n')
    TIMESTAMP_SEPARATOR = '-->'

    def __init__(self, index=0, start=None, end=None, text='', position=''):
        try:
            self.index = int(index)
        except (TypeError, ValueError):  # try to cast as int, but it's not mandatory
            self.index = index

        # Falsy start/end values are coerced from 0.
        self.start = WebVTTTime.coerce(start or 0)
        self.end = WebVTTTime.coerce(end or 0)
        self.position = str(position)
        self.text = str(text)

    @property
    def duration(self):
        """Difference between `end` and `start`."""
        return self.end - self.start

    @property
    def text_without_tags(self):
        """Text with every `<...>` span removed."""
        return self._text_tag_cleaner('<', '>')

    @property
    def text_without_brackets(self):
        """Text with every `[...]` span removed."""
        return self._text_tag_cleaner(r'\[', r'\]')

    @property
    def text_without_keys(self):
        """Text with every `{...}` span removed."""
        return self._text_tag_cleaner('{', '}')

    def _text_tag_cleaner(self, before_delimiter, after_delimiter):
        """
        Return the text with every span enclosed by the delimiters removed.

        Both delimiters are inserted verbatim into a regular expression,
        so characters that are special there must already be escaped.
        """
        # The span body excludes the *closing* delimiter, whatever it is,
        # so that e.g. "[a > b]" is matched as a bracket span too.
        pattern = r"{0}[^{1}]*?{1}".format(before_delimiter, after_delimiter)
        return re.sub(pattern, '', self.text)

    @property
    def text_without_trailing_spaces(self):
        """Text with leading and trailing whitespace stripped."""
        return self.text.strip()

    @property
    def characters_per_second(self):
        """
        Number of characters (tags and newlines excluded) divided by
        `duration.ordinal / 1000.0`; 0.0 when that divisor is zero.
        """
        characters_count = len(self.text_without_tags.replace('\n', ''))
        try:
            return characters_count / (self.duration.ordinal / 1000.0)
        except ZeroDivisionError:
            return 0.0

    def text_with_replacements(self, replacements_map=None):
        """
        Apply every `old -> new` pair of `replacements_map` to the text.

        The item's text is updated in place and the new text is returned.
        Passing nothing (or an empty mapping) leaves the text unchanged.
        """
        # `.items()` exists on both Python 2 and Python 3 mappings.
        for replaced, replacement in (replacements_map or {}).items():
            self.text = self.text.replace(replaced, replacement)
        return self.text

    def __str__(self):
        # The position is only emitted (after one space) when non-blank.
        position = ' %s' % self.position if self.position.strip() else ''
        return self.ITEM_PATTERN % (self.start, self.end, position, self.text)

    if is_py2:
        # On Python 2 the text representation is exposed as unicode() only.
        __unicode__ = __str__

        def __str__(self):
            raise NotImplementedError('Use unicode() instead!')

    def _cmpkey(self):
        """Key used for comparisons: the (start, end) pair."""
        return (self.start, self.end)

    def shift(self, *args, **kwargs):
        """
        shift(hours, minutes, seconds, milliseconds, ratio)

        Add given values to start and end attributes.
        All arguments are optional and have a default value of 0.
        """
        self.start.shift(*args, **kwargs)
        self.end.shift(*args, **kwargs)

    @classmethod
    def from_string(cls, source):
        """Build an item from a single string holding all of its lines."""
        return cls.from_lines(source.splitlines(True))

    @classmethod
    def from_lines(cls, lines):
        """
        Build an item from a list of lines.

        An optional leading line without the timestamp separator is taken
        as the index. Raises InvalidItem when fewer than two lines are
        given or when the timestamp line cannot be split.
        """
        if len(lines) < 2:
            raise InvalidItem()
        # Only end-of-line characters are removed, so trailing spaces in
        # the text survive. All cases are considered: '\n', '\r\n', '\r'
        lines = [l.rstrip("\n\r") for l in lines]
        lines[0] = lines[0].rstrip()
        index = None
        if cls.TIMESTAMP_SEPARATOR not in lines[0]:
            index = lines.pop(0)
        start, end, position = cls.split_timestamps(lines[0])
        body = '\n'.join(lines[1:])
        return cls(index, start, end, body, position)

    @classmethod
    def split_timestamps(cls, line):
        """
        Split a timestamp line into stripped (start, end, position) parts.

        Whatever follows the first space after the end timestamp is the
        position; it is '' when absent. Raises InvalidItem unless the
        separator occurs exactly once.
        """
        timestamps = line.split(cls.TIMESTAMP_SEPARATOR)
        if len(timestamps) != 2:
            raise InvalidItem()
        start, end_and_position = timestamps
        end_and_position = end_and_position.lstrip().split(' ', 1)
        end = end_and_position[0]
        position = end_and_position[1] if len(end_and_position) > 1 else ''
        return (s.strip() for s in (start, end, position))
def test_idempotence(self):
    """Parsing a fixture and rendering it again must give the source back."""
    plain_item = WebVTTItem.from_string(self.vtt)
    plain_rendered = str(plain_item)
    self.assertEqual(plain_rendered, self.vtt)

    # The coordinates fixture is compared without its first two characters.
    positioned_item = WebVTTItem.from_string(self.coordinates)
    positioned_rendered = str(positioned_item)
    self.assertEqual(positioned_rendered, self.coordinates[2:])
def test_serialization(self):
    """The rendered item must equal the fixture minus its first two characters."""
    rendered = str(self.item)
    expected = self.string[2:]
    self.assertEqual(rendered, expected)
def test_compare_from_string_and_from_path(self):
    """Items loaded from a path and from the same content as a string must render identically."""
    # Close the handle deterministically instead of leaking it.
    with codecs.open(self.utf8_path, encoding='utf_8') as source:
        unicode_content = source.read()

    iterator = zip(pyvtt.open(self.utf8_path),
                   pyvtt.from_string(unicode_content))
    for file_item, string_item in iterator:
        self.assertEqual(str(file_item), str(string_item))
from collections import UserList except ImportError: from UserList import UserList from copy import copy from itertools import chain from os import linesep from sys import stderr from pyvtt.vttexc import Error, InvalidFile from pyvtt.vttitem import WebVTTItem from pyvtt.compat import str BOMS = ((BOM_UTF32_LE, 'utf_32_le'), (BOM_UTF32_BE, 'utf_32_be'), (BOM_UTF16_LE, 'utf_16_le'), (BOM_UTF16_BE, 'utf_16_be'), (BOM_UTF8, 'utf_8')) CODECS_BOMS = dict((codec, str(bom, codec)) for bom, codec in BOMS) BIGGER_BOM = max(len(bom) for bom, encoding in BOMS) class WebVTTFile(UserList, object): """ WebVTT file descriptor. Provide a pure Python mapping on all metadata. WebVTTFile(items, eol, path, encoding) items -> list of WebVTTItem. Default to []. eol -> str: end of line character. Default to linesep used in opened file if any else to os.linesep. path -> str: path where file will be saved. To open an existant file see
def __str__(self):
    """Render the time with TIME_PATTERN; negative times render as zero."""
    if self.ordinal >= 0:
        return self.TIME_PATTERN % tuple(self)
    # Represent negative times as zero
    zero_time = WebVTTTime.from_ordinal(0)
    return str(zero_time)
class WebVTTItem(ComparableMixin):
    """
    WebVTTItem(index, start, end, text, position)

    index -> int when it can be cast, otherwise kept as given.
    start, end -> WebVTTTime or coercible.
    text -> unicode: text content for item.
    position -> unicode: raw vtt "display coordinates" string
    """
    ITEM_PATTERN = str('%s --> %s%s\n%s\n')
    TIMESTAMP_SEPARATOR = '-->'

    def __init__(self, index=0, start=None, end=None, text='', position=''):
        try:
            self.index = int(index)
        except (TypeError, ValueError):  # try to cast as int, but it's not mandatory
            self.index = index

        # Falsy start/end values are coerced from 0.
        self.start = WebVTTTime.coerce(start or 0)
        self.end = WebVTTTime.coerce(end or 0)
        self.position = str(position)
        self.text = str(text)

    @property
    def duration(self):
        """Difference between `end` and `start`."""
        return self.end - self.start

    @property
    def text_without_tags(self):
        """Text with every `<...>` span removed."""
        return re.compile(r'<[^>]*?>').sub('', self.text)

    @property
    def text_without_keys(self):
        """Text with every `{...}` span removed."""
        # The span body excludes the closing brace (not '>'), so a group
        # such as "{a > b}" is removed as well.
        return re.compile(r'{[^}]*?}').sub('', self.text)

    @property
    def text_without_strange_chars(self):
        """
        Text with a fixed list of backslash style codes removed.

        NOTE: reading this property rewrites `self.text` in place; the
        returned value is the updated text.
        """
        for c in [
            "\\i1", "\\i0", "\\b1", "\\b0", "\\b<weight>", "\\u1", "\\u0",
            "\\s1", "\\s0", "\\bord<size>", "\\xbord<size>", "\\ybord<size>",
            "\\shad<depth>", "\\xshad<depth>", "\\yshad<depth>"
        ]:
            self.text = self.text.replace(c, '')
        return self.text

    @property
    def text_without_trailing_spaces(self):
        """Text with leading and trailing whitespace stripped."""
        return self.text.strip()

    @property
    def characters_per_second(self):
        """
        Number of characters (tags and newlines excluded) divided by
        `duration.ordinal / 1000.0`; 0.0 when that divisor is zero.
        """
        characters_count = len(self.text_without_tags.replace('\n', ''))
        try:
            return characters_count / (self.duration.ordinal / 1000.0)
        except ZeroDivisionError:
            return 0.0

    def __str__(self):
        # The position is only emitted (after one space) when non-blank.
        position = ' %s' % self.position if self.position.strip() else ''
        return self.ITEM_PATTERN % (self.start, self.end, position, self.text)

    if is_py2:
        # On Python 2 the text representation is exposed as unicode() only.
        __unicode__ = __str__

        def __str__(self):
            raise NotImplementedError('Use unicode() instead!')

    def _cmpkey(self):
        """Key used for comparisons: the (start, end) pair."""
        return (self.start, self.end)

    def shift(self, *args, **kwargs):
        """
        shift(hours, minutes, seconds, milliseconds, ratio)

        Add given values to start and end attributes.
        All arguments are optional and have a default value of 0.
        """
        self.start.shift(*args, **kwargs)
        self.end.shift(*args, **kwargs)

    @classmethod
    def from_string(cls, source):
        """Build an item from a single string holding all of its lines."""
        return cls.from_lines(source.splitlines(True))

    @classmethod
    def from_lines(cls, lines):
        """
        Build an item from a list of lines.

        An optional leading line without the timestamp separator is taken
        as the index. Raises InvalidItem when fewer than two lines are
        given or when the timestamp line cannot be split.
        """
        if len(lines) < 2:
            raise InvalidItem()
        # Trailing whitespace, end-of-line characters included, is dropped
        # from every line.
        lines = [l.rstrip() for l in lines]
        index = None
        if cls.TIMESTAMP_SEPARATOR not in lines[0]:
            index = lines.pop(0)
        start, end, position = cls.split_timestamps(lines[0])
        body = '\n'.join(lines[1:])
        return cls(index, start, end, body, position)

    @classmethod
    def split_timestamps(cls, line):
        """
        Split a timestamp line into stripped (start, end, position) parts.

        Whatever follows the first space after the end timestamp is the
        position; it is '' when absent. Raises InvalidItem unless the
        separator occurs exactly once.
        """
        timestamps = line.split(cls.TIMESTAMP_SEPARATOR)
        if len(timestamps) != 2:
            raise InvalidItem()
        start, end_and_position = timestamps
        end_and_position = end_and_position.lstrip().split(' ', 1)
        end = end_and_position[0]
        position = end_and_position[1] if len(end_and_position) > 1 else ''
        return (s.strip() for s in (start, end, position))