def _read_smil(self, input_file):
    """
    Read sync map fragments from a SMIL file object.

    Limitations:
    1. only <par> elements are parsed, in document order
    2. timings must have hh:mm:ss.mmm or ss.mmm format (autodetected)
    3. both clipBegin and clipEnd attributes of <audio> must be populated

    :param input_file: object whose ``read()`` returns the SMIL document as text
    """
    def parse_clip_time(raw_value):
        # Try the hh:mm:ss.mmm parser first; fall back to ss.mmm
        # only when the first parser yields None.
        parsed = gf.time_from_hhmmssmmm(raw_value)
        if parsed is None:
            parsed = gf.time_from_ssmmm(raw_value)
        return parsed

    namespace = "{http://www.w3.org/ns/SMIL}"
    text_tag = namespace + "text"
    audio_tag = namespace + "audio"
    document = etree.fromstring(input_file.read().encode("utf-8"))
    for par_node in document.iter(namespace + "par"):
        for node in par_node:
            if node.tag == text_tag:
                # remember the identifier for the <audio> element(s) that follow
                identifier = gf.split_url(node.get("src"))[1]
            elif node.tag == audio_tag:
                begin = parse_clip_time(node.get("clipBegin"))
                end = parse_clip_time(node.get("clipEnd"))
                # TODO read text from additional text_file?
                fragment = TextFragment(identifier=identifier, lines=[u""])
                self.append(SyncMapFragment(fragment, begin, end))
def parse(self, input_text, syncmap):
    """
    Read from SMIL text, adding one fragment per ``<audio>`` element
    found inside a ``<par>`` element.

    Limitations:
    1. parses only ``<par>`` elements, in order
    2. timings must have ``hh:mm:ss.mmm`` or ``ss.mmm`` format (autodetected)
    3. both ``clipBegin`` and ``clipEnd`` attributes of ``<audio>`` must be populated

    :param input_text: the SMIL document (converted with ``gf.safe_bytes`` before parsing)
    :param syncmap: the sync map object forwarded to ``_add_fragment``
    """
    from lxml import etree

    def clip_time(raw_value):
        # A colon marks the hh:mm:ss.mmm form; anything else is read as ss.mmm.
        if ":" in raw_value:
            return gf.time_from_hhmmssmmm(raw_value)
        return gf.time_from_ssmmm(raw_value)

    namespace = "{http://www.w3.org/ns/SMIL}"
    text_tag = namespace + "text"
    audio_tag = namespace + "audio"
    document = etree.fromstring(gf.safe_bytes(input_text))
    for par_node in document.iter(namespace + "par"):
        for node in par_node:
            if node.tag == text_tag:
                # keep the second component of the split ``src`` URL as identifier
                source_parts = gf.split_url(node.get("src"))
                identifier = gf.safe_unicode(source_parts[1])
            elif node.tag == audio_tag:
                begin = clip_time(node.get("clipBegin"))
                end = clip_time(node.get("clipEnd"))
                # TODO read text from additional text_file?
                self._add_fragment(
                    syncmap=syncmap,
                    identifier=identifier,
                    lines=[u""],
                    begin=begin,
                    end=end
                )
def parse(self, input_text, syncmap):
    """
    Read fragments from JSON text whose top-level ``smil_data`` key
    holds a list of entries with ``id``, ``begin`` and ``end`` keys.

    Each fragment is added with a single empty text line.

    :param input_text: the JSON document, as accepted by ``json.loads``
    :param syncmap: the sync map object forwarded to ``_add_fragment``
    """
    document = json.loads(input_text)
    for entry in document["smil_data"]:
        identifier = entry["id"]
        begin = gf.time_from_ssmmm(entry["begin"])
        end = gf.time_from_ssmmm(entry["end"])
        # TODO read text from additional text_file?
        self._add_fragment(
            syncmap=syncmap,
            identifier=identifier,
            lines=[u""],
            begin=begin,
            end=end
        )
def parse(self, input_text, syncmap):
    """
    Parse JSON text and add one fragment for every item of its
    ``smil_data`` list.

    The ``id`` of each item becomes the fragment identifier, and its
    ``begin`` and ``end`` values are converted with ``gf.time_from_ssmmm``.
    The fragment text is a single empty line.

    :param input_text: the JSON document, as accepted by ``json.loads``
    :param syncmap: the sync map object forwarded to ``_add_fragment``
    """
    smil_items = json.loads(input_text)["smil_data"]
    for item in smil_items:
        fragment_id = item["id"]
        fragment_begin = gf.time_from_ssmmm(item["begin"])
        fragment_end = gf.time_from_ssmmm(item["end"])
        # TODO read text from additional text_file?
        self._add_fragment(
            syncmap=syncmap,
            identifier=fragment_id,
            lines=[u""],
            begin=fragment_begin,
            end=fragment_end
        )
def parse(self, input_text, syncmap):
    """
    Read fragments from JSON text whose top-level ``fragments`` key
    holds a list of entries with ``id``, ``language``, ``lines``,
    ``begin`` and ``end`` keys.

    :param input_text: the JSON document, as accepted by ``json.loads``
    :param syncmap: the sync map object forwarded to ``_add_fragment``
    """
    document = json.loads(input_text)
    for entry in document["fragments"]:
        identifier = entry["id"]
        language = entry["language"]
        lines = entry["lines"]
        begin = gf.time_from_ssmmm(entry["begin"])
        end = gf.time_from_ssmmm(entry["end"])
        self._add_fragment(
            syncmap=syncmap,
            identifier=identifier,
            language=language,
            lines=lines,
            begin=begin,
            end=end
        )
def _read_rbse(self, input_file):
    """
    Read from RBSE file.

    The file content is JSON with a top-level ``smil_data`` list; each
    entry provides ``id``, ``begin`` and ``end``. Every fragment is
    appended with a single empty text line.

    :param input_file: object whose ``read()`` returns the JSON document
    """
    document = json.loads(input_file.read())
    for entry in document["smil_data"]:
        identifier = entry["id"]
        begin = gf.time_from_ssmmm(entry["begin"])
        end = gf.time_from_ssmmm(entry["end"])
        # TODO read text from additional text_file?
        fragment = TextFragment(identifier=identifier, lines=[u""])
        self.append(SyncMapFragment(fragment, begin, end))
def parse(self, input_text, syncmap):
    """
    Read fragments from an XML document containing ``TIME_SLOT`` and
    ``ALIGNABLE_ANNOTATION`` elements.

    Every ``TIME_SLOT`` maps its ``TIME_SLOT_ID`` to a time value; each
    ``ALIGNABLE_ANNOTATION`` then becomes one fragment whose interval is
    looked up through ``TIME_SLOT_REF1`` / ``TIME_SLOT_REF2`` and whose
    lines are the texts of its ``ANNOTATION_VALUE`` descendants.

    :param input_text: the XML document (converted with ``gf.safe_bytes`` before parsing)
    :param syncmap: the sync map object forwarded to ``_add_fragment``
    :raises KeyError: if an annotation references a time slot id that was not declared
    """
    from lxml import etree

    # get root
    root = etree.fromstring(gf.safe_bytes(input_text))

    # get time slots
    # NOTE(review): TIME_VALUE is presumably expressed in milliseconds,
    # hence the division by 1000 -- confirm against the format specification
    time_slots = dict()
    for ts in root.iter("TIME_SLOT"):
        time_slots[ts.get("TIME_SLOT_ID")] = gf.time_from_ssmmm(ts.get("TIME_VALUE")) / 1000

    # parse annotations
    for alignable in root.iter("ALIGNABLE_ANNOTATION"):
        identifier = gf.safe_unicode(alignable.get("ANNOTATION_ID"))
        begin = time_slots[alignable.get("TIME_SLOT_REF1")]
        end = time_slots[alignable.get("TIME_SLOT_REF2")]
        lines = [
            gf.safe_unicode(value.text)
            for value in alignable.iter("ANNOTATION_VALUE")
        ]
        self._add_fragment(
            syncmap=syncmap,
            identifier=identifier,
            lines=lines,
            begin=begin,
            end=end
        )
def parse(self, input_text, syncmap):
    """
    Parse an XML document made of ``TIME_SLOT`` declarations and
    ``ALIGNABLE_ANNOTATION`` elements, adding one fragment per annotation.

    The begin and end of an annotation are resolved by looking up its
    ``TIME_SLOT_REF1`` and ``TIME_SLOT_REF2`` attributes among the
    declared time slots; its lines are the texts of the
    ``ANNOTATION_VALUE`` elements it contains.

    :param input_text: the XML document (converted with ``gf.safe_bytes`` before parsing)
    :param syncmap: the sync map object forwarded to ``_add_fragment``
    :raises KeyError: if an annotation references a time slot id that was not declared
    """
    from lxml import etree

    root = etree.fromstring(gf.safe_bytes(input_text))

    # Build the slot-id -> time lookup table first, since annotations
    # only carry references to slots.
    # NOTE(review): the division by 1000 suggests TIME_VALUE is in
    # milliseconds -- confirm against the format specification
    time_slots = {}
    for slot in root.iter("TIME_SLOT"):
        slot_time = gf.time_from_ssmmm(slot.get("TIME_VALUE")) / 1000
        time_slots[slot.get("TIME_SLOT_ID")] = slot_time

    for annotation in root.iter("ALIGNABLE_ANNOTATION"):
        identifier = gf.safe_unicode(annotation.get("ANNOTATION_ID"))
        begin = time_slots[annotation.get("TIME_SLOT_REF1")]
        end = time_slots[annotation.get("TIME_SLOT_REF2")]
        lines = [
            gf.safe_unicode(value.text)
            for value in annotation.iter("ANNOTATION_VALUE")
        ]
        self._add_fragment(
            syncmap=syncmap,
            identifier=identifier,
            lines=lines,
            begin=begin,
            end=end
        )
def _read_xml(self, input_file):
    """
    Read from XML file.

    Each child of the root element is one fragment: its ``id``,
    ``begin`` and ``end`` attributes give identifier and interval, and
    the texts of its ``line`` children give the fragment lines.

    :param input_file: object whose ``read()`` returns the XML document as text
    """
    document = etree.fromstring(input_file.read().encode("utf-8"))
    for node in document:
        identifier = node.get("id")
        begin = gf.time_from_ssmmm(node.get("begin"))
        end = gf.time_from_ssmmm(node.get("end"))
        lines = [child.text for child in node if child.tag == "line"]
        fragment = TextFragment(identifier=identifier, lines=lines)
        self.append(SyncMapFragment(fragment, begin, end))
def _read_json(self, input_file):
    """
    Read from JSON file.

    The document has a top-level ``fragments`` list; each entry
    provides ``id``, ``language``, ``begin``, ``end`` and ``lines``.

    :param input_file: object whose ``read()`` returns the JSON document
    """
    document = json.loads(input_file.read())
    for entry in document["fragments"]:
        identifier = entry["id"]
        language = entry["language"]
        begin = gf.time_from_ssmmm(entry["begin"])
        end = gf.time_from_ssmmm(entry["end"])
        # copy the lines so the fragment does not share the parsed list
        lines = list(entry["lines"])
        fragment = TextFragment(
            identifier=identifier,
            language=language,
            lines=lines
        )
        self.append(SyncMapFragment(fragment, begin, end))
def parse(self, input_text, syncmap):
    """
    Read fragments from an XML document where each child of the root
    describes one fragment through ``identifier``, ``start`` and
    ``end`` child elements.

    Each fragment is added with a single empty text line.

    :param input_text: the XML document (converted with ``gf.safe_bytes`` before parsing)
    :param syncmap: the sync map object forwarded to ``_add_fragment``
    """
    from lxml import etree

    document = etree.fromstring(gf.safe_bytes(input_text))
    for fragment_node in document:
        # NOTE: values are not reset between fragments, so a fragment
        # lacking one of the three elements reuses the previous value
        for node in fragment_node:
            tag = node.tag
            if tag == "identifier":
                identifier = gf.safe_unicode(node.text)
            elif tag == "start":
                begin = gf.time_from_ssmmm(node.text)
            elif tag == "end":
                end = gf.time_from_ssmmm(node.text)
        # TODO read text from additional text_file?
        self._add_fragment(
            syncmap=syncmap,
            identifier=identifier,
            lines=[u""],
            begin=begin,
            end=end
        )
def parse(self, input_text, syncmap):
    """
    Read fragments from an XML document where each child of the root
    is one fragment carrying ``id``, ``begin`` and ``end`` attributes,
    and whose ``line`` children hold the fragment text.

    :param input_text: the XML document (converted with ``gf.safe_bytes`` before parsing)
    :param syncmap: the sync map object forwarded to ``_add_fragment``
    """
    from lxml import etree

    document = etree.fromstring(gf.safe_bytes(input_text))
    for node in document:
        identifier = gf.safe_unicode(node.get("id"))
        begin = gf.time_from_ssmmm(node.get("begin"))
        end = gf.time_from_ssmmm(node.get("end"))
        lines = [
            gf.safe_unicode(child.text)
            for child in node
            if child.tag == "line"
        ]
        self._add_fragment(
            syncmap=syncmap,
            identifier=identifier,
            lines=lines,
            begin=begin,
            end=end
        )
def _read_xml_legacy(self, input_file):
    """
    Read from XML file (legacy format).

    Each child of the root describes one fragment through
    ``identifier``, ``start`` and ``end`` child elements. Every
    fragment is appended with a single empty text line.

    :param input_file: object whose ``read()`` returns the XML document as text
    """
    document = etree.fromstring(input_file.read().encode("utf-8"))
    for fragment_node in document:
        # NOTE: values are not reset between fragments, so a fragment
        # lacking one of the three elements reuses the previous value
        for node in fragment_node:
            tag = node.tag
            if tag == "identifier":
                identifier = node.text
            elif tag == "start":
                begin = gf.time_from_ssmmm(node.text)
            elif tag == "end":
                end = gf.time_from_ssmmm(node.text)
        # TODO read text from additional text_file?
        fragment = TextFragment(identifier=identifier, lines=[""])
        self.append(SyncMapFragment(fragment, begin, end))
def parse(self, input_text, syncmap):
    """
    Parse a legacy-style XML document: every child of the root holds
    ``identifier``, ``start`` and ``end`` elements describing one
    fragment, which is added with a single empty text line.

    :param input_text: the XML document (converted with ``gf.safe_bytes`` before parsing)
    :param syncmap: the sync map object forwarded to ``_add_fragment``
    """
    from lxml import etree

    tree_root = etree.fromstring(gf.safe_bytes(input_text))
    for entry in tree_root:
        # NOTE: the three values persist across iterations; an entry
        # missing one element falls back on the value read previously
        for element in entry:
            element_tag = element.tag
            if element_tag == "identifier":
                identifier = gf.safe_unicode(element.text)
            elif element_tag == "start":
                begin = gf.time_from_ssmmm(element.text)
            elif element_tag == "end":
                end = gf.time_from_ssmmm(element.text)
        # TODO read text from additional text_file?
        self._add_fragment(
            syncmap=syncmap,
            identifier=identifier,
            lines=[u""],
            begin=begin,
            end=end
        )
def test_time_from_ssmmm(self):
    """
    Check ``gf.time_from_ssmmm`` against (input, expected value) pairs,
    including ``None``, the empty string and zero-padded inputs.
    """
    expectations = (
        (None, 0),
        ("", 0),
        ("0", 0),
        ("000", 0),
        ("1", 1),
        ("001", 1),
        ("1.234", 1.234),
        ("001.234", 1.234),
    )
    for raw_value, expected in expectations:
        self.assertEqual(gf.time_from_ssmmm(raw_value), expected)
def test_time_from_ssmmm(self):
    """
    Check that ``gf.time_from_ssmmm`` returns values equal to the
    expected ``TimeValue`` for ``None``, empty, integer-like and
    decimal inputs (with and without zero padding).
    """
    zero = TimeValue("0")
    expectations = [
        (None, zero),
        ("", zero),
        ("0", zero),
        ("000", zero),
        ("1", TimeValue("1")),
        ("001", TimeValue("1")),
        ("1.234", TimeValue("1.234")),
        ("001.234", TimeValue("1.234")),
    ]
    for raw_value, expected in expectations:
        self.assertEqual(gf.time_from_ssmmm(raw_value), expected)