示例#1
0
    def _read_smil(self, input_file):
        """
        Read from SMIL file.

        Limitations:
        1. parses only <par> elements, in order
        2. timings must have hh:mm:ss.mmm or ss.mmm format (autodetected)
        3. both clipBegin and clipEnd attributes of <audio> must be populated

        A <par> element lacking either a <text> or an <audio> child
        is skipped, so that it can never inherit the identifier
        or the timings parsed for a previous <par>.

        :param input_file: an open file-like object; the value returned
                           by its ``read()`` is encoded as UTF-8
                           before being parsed as XML
        """
        smil_ns = "{http://www.w3.org/ns/SMIL}"
        text_tag = smil_ns + "text"
        audio_tag = smil_ns + "audio"
        contents = input_file.read()
        root = etree.fromstring(contents.encode("utf-8"))
        for par in root.iter(smil_ns + "par"):
            # state is reset for every <par>: values must not leak
            # from one <par> into the next one
            identifier = begin = end = None
            has_text = has_audio = False
            for child in par:
                if child.tag == text_tag:
                    identifier = gf.split_url(child.get("src"))[1]
                    has_text = True
                elif child.tag == audio_tag:
                    # autodetect: try hh:mm:ss.mmm first,
                    # fall back to ss.mmm when that yields None
                    begin = gf.time_from_hhmmssmmm(child.get("clipBegin"))
                    if begin is None:
                        begin = gf.time_from_ssmmm(child.get("clipBegin"))
                    end = gf.time_from_hhmmssmmm(child.get("clipEnd"))
                    if end is None:
                        end = gf.time_from_ssmmm(child.get("clipEnd"))
                    has_audio = True
            if not (has_text and has_audio):
                # incomplete <par>: nothing sensible to append
                continue
            # TODO read text from additional text_file?
            text = u""
            text_fragment = TextFragment(identifier=identifier, lines=[text])
            sm_fragment = SyncMapFragment(text_fragment, begin, end)
            self.append(sm_fragment)
示例#2
0
    def parse(self, input_text, syncmap):
        """
        Read from SMIL file.

        Limitations:
        1. parses only ``<par>`` elements, in order
        2. timings must have ``hh:mm:ss.mmm`` or ``ss.mmm`` format (autodetected)
        3. both ``clipBegin`` and ``clipEnd`` attributes of ``<audio>`` must be populated

        A ``<par>`` element lacking either a ``<text>`` or an ``<audio>``
        child is skipped, so that it can never inherit the identifier
        or the timings parsed for a previous ``<par>``.

        :param input_text: the SMIL document; it is passed through
                           ``gf.safe_bytes`` before being parsed as XML
        :param syncmap: forwarded unchanged to ``self._add_fragment``
        """
        from lxml import etree

        def parse_clip_time(value):
            # a colon can only appear in the hh:mm:ss.mmm format
            if ":" in value:
                return gf.time_from_hhmmssmmm(value)
            return gf.time_from_ssmmm(value)

        smil_ns = "{http://www.w3.org/ns/SMIL}"
        text_tag = smil_ns + "text"
        audio_tag = smil_ns + "audio"
        root = etree.fromstring(gf.safe_bytes(input_text))
        for par in root.iter(smil_ns + "par"):
            # state is reset for every <par>: values must not leak
            # from one <par> into the next one
            identifier = begin = end = None
            has_text = has_audio = False
            for child in par:
                if child.tag == text_tag:
                    identifier = gf.safe_unicode(gf.split_url(child.get("src"))[1])
                    has_text = True
                elif child.tag == audio_tag:
                    begin = parse_clip_time(child.get("clipBegin"))
                    end = parse_clip_time(child.get("clipEnd"))
                    has_audio = True
            if not (has_text and has_audio):
                # incomplete <par>: nothing sensible to add
                continue
            # TODO read text from additional text_file?
            self._add_fragment(
                syncmap=syncmap,
                identifier=identifier,
                lines=[u""],
                begin=begin,
                end=end
            )
示例#3
0
 def parse(self, input_text, syncmap):
     """
     Read fragments from a JSON document and add them to ``syncmap``.

     The document must be a JSON object with a ``smil_data`` list;
     each entry must provide ``id``, ``begin`` and ``end`` keys.
     Every fragment is added with a single empty text line.

     :param input_text: the JSON document, as accepted by ``json.loads``
     :param syncmap: forwarded unchanged to ``self._add_fragment``
     """
     for entry in json.loads(input_text)["smil_data"]:
         # TODO read text from additional text_file?
         identifier = entry["id"]
         empty_lines = [u""]
         begin = gf.time_from_ssmmm(entry["begin"])
         end = gf.time_from_ssmmm(entry["end"])
         self._add_fragment(
             syncmap=syncmap,
             identifier=identifier,
             lines=empty_lines,
             begin=begin,
             end=end
         )
示例#4
0
 def parse(self, input_text, syncmap):
     """
     Populate ``syncmap`` from the ``smil_data`` list of a JSON document.

     Each list entry is read through its ``id``, ``begin`` and ``end``
     keys; the times are converted with ``gf.time_from_ssmmm``
     and the fragment text is left as one empty line.

     :param input_text: the JSON document, as accepted by ``json.loads``
     :param syncmap: forwarded unchanged to ``self._add_fragment``
     """
     document = json.loads(input_text)
     for item in document["smil_data"]:
         # TODO read text from additional text_file?
         fragment_id = item["id"]
         begin_time = gf.time_from_ssmmm(item["begin"])
         end_time = gf.time_from_ssmmm(item["end"])
         self._add_fragment(
             syncmap=syncmap,
             identifier=fragment_id,
             lines=[u""],
             begin=begin_time,
             end=end_time
         )
示例#5
0
 def parse(self, input_text, syncmap):
     """
     Populate ``syncmap`` from the ``fragments`` list of a JSON document.

     Each list entry must provide ``id``, ``language``, ``lines``,
     ``begin`` and ``end`` keys; the times are converted
     with ``gf.time_from_ssmmm``.

     :param input_text: the JSON document, as accepted by ``json.loads``
     :param syncmap: forwarded unchanged to ``self._add_fragment``
     """
     document = json.loads(input_text)
     for item in document["fragments"]:
         fragment_id = item["id"]
         fragment_language = item["language"]
         fragment_lines = item["lines"]
         begin_time = gf.time_from_ssmmm(item["begin"])
         end_time = gf.time_from_ssmmm(item["end"])
         self._add_fragment(
             syncmap=syncmap,
             identifier=fragment_id,
             language=fragment_language,
             lines=fragment_lines,
             begin=begin_time,
             end=end_time
         )
示例#6
0
 def _read_rbse(self, input_file):
     """
     Read from RBSE file.

     The file content is parsed as JSON; each entry of its
     ``smil_data`` list is appended to this object as a sync map
     fragment with a single empty text line.

     :param input_file: an open file-like object providing ``read()``
     """
     document = json.loads(input_file.read())
     for entry in document["smil_data"]:
         fragment_id = entry["id"]
         begin_time = gf.time_from_ssmmm(entry["begin"])
         end_time = gf.time_from_ssmmm(entry["end"])
         # TODO read text from additional text_file?
         self.append(
             SyncMapFragment(
                 TextFragment(identifier=fragment_id, lines=[u""]),
                 begin_time,
                 end_time
             )
         )
示例#7
0
 def parse(self, input_text, syncmap):
     """
     Read fragments from an EAF (ELAN) XML document into ``syncmap``.

     The ``TIME_SLOT`` elements are collected first, mapping each
     ``TIME_SLOT_ID`` to its ``TIME_VALUE`` divided by 1000
     (EAF expresses time values in milliseconds).
     Then one fragment is added for each ``ALIGNABLE_ANNOTATION``,
     using its two time slot references as begin and end,
     and its ``ANNOTATION_VALUE`` texts as lines.

     :param input_text: the EAF document; it is passed through
                        ``gf.safe_bytes`` before being parsed as XML
     :param syncmap: forwarded unchanged to ``self._add_fragment``
     :raises KeyError: if an annotation references an unknown time slot
     """
     from lxml import etree
     # get root
     root = etree.fromstring(gf.safe_bytes(input_text))
     # get time slots
     time_slots = {
         ts.get("TIME_SLOT_ID"): gf.time_from_ssmmm(ts.get("TIME_VALUE")) / 1000
         for ts in root.iter("TIME_SLOT")
     }
     # parse annotations
     for alignable in root.iter("ALIGNABLE_ANNOTATION"):
         identifier = gf.safe_unicode(alignable.get("ANNOTATION_ID"))
         begin = time_slots[alignable.get("TIME_SLOT_REF1")]
         end = time_slots[alignable.get("TIME_SLOT_REF2")]
         lines = [
             gf.safe_unicode(value.text)
             for value in alignable.iter("ANNOTATION_VALUE")
         ]
         self._add_fragment(syncmap=syncmap,
                            identifier=identifier,
                            lines=lines,
                            begin=begin,
                            end=end)
示例#8
0
 def parse(self, input_text, syncmap):
     """
     Read fragments from an EAF (ELAN) XML document into ``syncmap``.

     The ``TIME_SLOT`` elements are collected first, mapping each
     ``TIME_SLOT_ID`` to its ``TIME_VALUE`` divided by 1000
     (EAF expresses time values in milliseconds).
     Then one fragment is added for each ``ALIGNABLE_ANNOTATION``,
     using its two time slot references as begin and end,
     and its ``ANNOTATION_VALUE`` texts as lines.

     :param input_text: the EAF document; it is passed through
                        ``gf.safe_bytes`` before being parsed as XML
     :param syncmap: forwarded unchanged to ``self._add_fragment``
     :raises KeyError: if an annotation references an unknown time slot
     """
     from lxml import etree
     # get root
     root = etree.fromstring(gf.safe_bytes(input_text))
     # get time slots
     time_slots = {
         ts.get("TIME_SLOT_ID"): gf.time_from_ssmmm(ts.get("TIME_VALUE")) / 1000
         for ts in root.iter("TIME_SLOT")
     }
     # parse annotations
     for alignable in root.iter("ALIGNABLE_ANNOTATION"):
         identifier = gf.safe_unicode(alignable.get("ANNOTATION_ID"))
         begin = time_slots[alignable.get("TIME_SLOT_REF1")]
         end = time_slots[alignable.get("TIME_SLOT_REF2")]
         lines = [
             gf.safe_unicode(value.text)
             for value in alignable.iter("ANNOTATION_VALUE")
         ]
         self._add_fragment(
             syncmap=syncmap,
             identifier=identifier,
             lines=lines,
             begin=begin,
             end=end
         )
示例#9
0
 def _read_xml(self, input_file):
     """
     Read from XML file.

     Each child of the root element becomes one sync map fragment:
     its ``id``, ``begin`` and ``end`` attributes give the identifier
     and the timings, and the texts of its ``line`` children
     give the fragment lines.

     :param input_file: an open file-like object; the value returned
                        by its ``read()`` is encoded as UTF-8
                        before being parsed as XML
     """
     xml_bytes = input_file.read().encode("utf-8")
     for node in etree.fromstring(xml_bytes):
         fragment_id = node.get("id")
         begin_time = gf.time_from_ssmmm(node.get("begin"))
         end_time = gf.time_from_ssmmm(node.get("end"))
         fragment_lines = [sub.text for sub in node if sub.tag == "line"]
         self.append(
             SyncMapFragment(
                 TextFragment(identifier=fragment_id, lines=fragment_lines),
                 begin_time,
                 end_time
             )
         )
示例#10
0
 def _read_json(self, input_file):
     """
     Read from JSON file.

     The file content is parsed as JSON; each entry of its
     ``fragments`` list must provide ``id``, ``language``, ``begin``,
     ``end`` and ``lines`` keys, and is appended to this object
     as a sync map fragment.

     :param input_file: an open file-like object providing ``read()``
     """
     document = json.loads(input_file.read())
     for entry in document["fragments"]:
         fragment_id = entry["id"]
         fragment_language = entry["language"]
         begin_time = gf.time_from_ssmmm(entry["begin"])
         end_time = gf.time_from_ssmmm(entry["end"])
         # copy the lines, so the fragment does not share the parsed list
         fragment_lines = list(entry["lines"])
         self.append(
             SyncMapFragment(
                 TextFragment(
                     identifier=fragment_id,
                     language=fragment_language,
                     lines=fragment_lines
                 ),
                 begin_time,
                 end_time
             )
         )
示例#11
0
 def parse(self, input_text, syncmap):
     """
     Read fragments from a legacy-format XML document into ``syncmap``.

     Each child of the root element is expected to contain
     ``identifier``, ``start`` and ``end`` child elements.
     A fragment lacking any of the three is skipped, so that it can
     never inherit a value parsed for a previous fragment.
     Every fragment is added with a single empty text line.

     :param input_text: the XML document; it is passed through
                        ``gf.safe_bytes`` before being parsed as XML
     :param syncmap: forwarded unchanged to ``self._add_fragment``
     """
     from lxml import etree
     root = etree.fromstring(gf.safe_bytes(input_text))
     for frag in root:
         # collected afresh for every fragment: values must not leak
         # from one fragment into the next one
         fields = {}
         for child in frag:
             if child.tag == "identifier":
                 fields["identifier"] = gf.safe_unicode(child.text)
             elif child.tag == "start":
                 fields["begin"] = gf.time_from_ssmmm(child.text)
             elif child.tag == "end":
                 fields["end"] = gf.time_from_ssmmm(child.text)
         if len(fields) < 3:
             # incomplete fragment: nothing sensible to add
             continue
         # TODO read text from additional text_file?
         self._add_fragment(syncmap=syncmap,
                            identifier=fields["identifier"],
                            lines=[u""],
                            begin=fields["begin"],
                            end=fields["end"])
示例#12
0
 def parse(self, input_text, syncmap):
     """
     Read fragments from an XML document into ``syncmap``.

     Each child of the root element becomes one fragment:
     its ``id``, ``begin`` and ``end`` attributes give the identifier
     and the timings, and the texts of its ``line`` children
     give the fragment lines.

     :param input_text: the XML document; it is passed through
                        ``gf.safe_bytes`` before being parsed as XML
     :param syncmap: forwarded unchanged to ``self._add_fragment``
     """
     from lxml import etree
     xml_bytes = gf.safe_bytes(input_text)
     for node in etree.fromstring(xml_bytes):
         fragment_id = gf.safe_unicode(node.get("id"))
         begin_time = gf.time_from_ssmmm(node.get("begin"))
         end_time = gf.time_from_ssmmm(node.get("end"))
         fragment_lines = [
             gf.safe_unicode(sub.text) for sub in node if sub.tag == "line"
         ]
         self._add_fragment(
             syncmap=syncmap,
             identifier=fragment_id,
             lines=fragment_lines,
             begin=begin_time,
             end=end_time
         )
示例#13
0
 def _read_xml_legacy(self, input_file):
     """
     Read from XML file (legacy format).

     Each child of the root element is expected to contain
     ``identifier``, ``start`` and ``end`` child elements.
     A fragment lacking any of the three is skipped, so that it can
     never inherit a value parsed for a previous fragment.
     Every fragment is appended with a single empty text line.

     :param input_file: an open file-like object; the value returned
                        by its ``read()`` is encoded as UTF-8
                        before being parsed as XML
     """
     contents = input_file.read()
     root = etree.fromstring(contents.encode("utf-8"))
     for frag in root:
         # collected afresh for every fragment: values must not leak
         # from one fragment into the next one
         fields = {}
         for child in frag:
             if child.tag == "identifier":
                 fields["identifier"] = child.text
             elif child.tag == "start":
                 fields["begin"] = gf.time_from_ssmmm(child.text)
             elif child.tag == "end":
                 fields["end"] = gf.time_from_ssmmm(child.text)
         if len(fields) < 3:
             # incomplete fragment: nothing sensible to append
             continue
         # TODO read text from additional text_file?
         text = ""
         text_fragment = TextFragment(identifier=fields["identifier"], lines=[text])
         sm_fragment = SyncMapFragment(text_fragment, fields["begin"], fields["end"])
         self.append(sm_fragment)
示例#14
0
 def parse(self, input_text, syncmap):
     """
     Read fragments from a legacy-format XML document into ``syncmap``.

     Each child of the root element is expected to contain
     ``identifier``, ``start`` and ``end`` child elements.
     A fragment lacking any of the three is skipped, so that it can
     never inherit a value parsed for a previous fragment.
     Every fragment is added with a single empty text line.

     :param input_text: the XML document; it is passed through
                        ``gf.safe_bytes`` before being parsed as XML
     :param syncmap: forwarded unchanged to ``self._add_fragment``
     """
     from lxml import etree
     root = etree.fromstring(gf.safe_bytes(input_text))
     for frag in root:
         # collected afresh for every fragment: values must not leak
         # from one fragment into the next one
         fields = {}
         for child in frag:
             if child.tag == "identifier":
                 fields["identifier"] = gf.safe_unicode(child.text)
             elif child.tag == "start":
                 fields["begin"] = gf.time_from_ssmmm(child.text)
             elif child.tag == "end":
                 fields["end"] = gf.time_from_ssmmm(child.text)
         if len(fields) < 3:
             # incomplete fragment: nothing sensible to add
             continue
         # TODO read text from additional text_file?
         self._add_fragment(
             syncmap=syncmap,
             identifier=fields["identifier"],
             lines=[u""],
             begin=fields["begin"],
             end=fields["end"]
         )
示例#15
0
 def test_time_from_ssmmm(self):
     """
     Check ``gf.time_from_ssmmm`` on empty, integer and decimal inputs.

     Each case pairs the input with the value it must compare equal to.
     """
     cases = (
         (None, 0),
         ("", 0),
         ("0", 0),
         ("000", 0),
         ("1", 1),
         ("001", 1),
         ("1.234", 1.234),
         ("001.234", 1.234),
     )
     for string, expected in cases:
         actual = gf.time_from_ssmmm(string)
         self.assertEqual(actual, expected)
 def test_time_from_ssmmm(self):
     """
     Check ``gf.time_from_ssmmm`` on empty, integer and decimal inputs.

     Each case pairs the input with the ``TimeValue`` it must equal.
     """
     zero = TimeValue("0")
     one = TimeValue("1")
     decimal = TimeValue("1.234")
     cases = (
         (None, zero),
         ("", zero),
         ("0", zero),
         ("000", zero),
         ("1", one),
         ("001", one),
         ("1.234", decimal),
         ("001.234", decimal),
     )
     for string, expected in cases:
         actual = gf.time_from_ssmmm(string)
         self.assertEqual(actual, expected)
示例#17
0
 def test_time_from_ssmmm(self):
     """
     Verify the values returned by ``gf.time_from_ssmmm``.

     ``None`` and the empty string must compare equal to zero;
     leading zeros must not change the parsed value.
     """
     expected_by_input = [
         (None, 0),
         ("", 0),
         ("0", 0),
         ("000", 0),
         ("1", 1),
         ("001", 1),
         ("1.234", 1.234),
         ("001.234", 1.234),
     ]
     for raw, wanted in expected_by_input:
         self.assertEqual(gf.time_from_ssmmm(raw), wanted)
示例#18
0
 def test_time_from_ssmmm(self):
     """
     Verify the ``TimeValue`` objects returned by ``gf.time_from_ssmmm``.

     ``None`` and the empty string must equal ``TimeValue("0")``;
     leading zeros must not change the parsed value.
     """
     expected_by_input = [
         (None, "0"),
         ("", "0"),
         ("0", "0"),
         ("000", "0"),
         ("1", "1"),
         ("001", "1"),
         ("1.234", "1.234"),
         ("001.234", "1.234"),
     ]
     for raw, wanted in expected_by_input:
         self.assertEqual(gf.time_from_ssmmm(raw), TimeValue(wanted))