Пример #1
0
    def __init__(self, xml_file_name_or_src):
        """
        Read and parse an xml document, then index its element tree.

        Parsing goes through ``ET.iterparse`` (a modified version of the
        standard python xml.etree.ElementTree).

        xml_file_name_or_src can be a file name like: "content.xml" OR
        can be xml source.  It is treated as a file name (and read from disk
        as UTF-8) only when it ends with ".xml" and is shorter than 256
        characters; otherwise it is parsed as xml source text.

        Raises KeyError if a namespaced tag or attribute refers to a
        namespace URI that has not been announced by a "start-ns" event, or
        if the root tag has no entry in self.qnameOD.
        """
        if xml_file_name_or_src.endswith('.xml') and len(xml_file_name_or_src) < 256:
            self.xml_file_name_or_src = xml_file_name_or_src

            # The context manager closes the file even if read() raises.
            with io.open(xml_file_name_or_src, 'rt', encoding='utf-8') as fInp:
                xml_src = fInp.read()
        else:
            self.xml_file_name_or_src = None
            xml_src = xml_file_name_or_src

        self.xml_header = ''  # Assume no header unless found at head of file
        match = header_re.match(xml_src)
        if match:
            self.xml_header = match.group(0)  # will need \n when serialized

        # ns entries like: ('urn:oasis:names:tc:opendocument:xmlns:table:1.0', u'table')
        self.nsOD = OrderedDict()

        # rev_ns entries like: (u'table', 'urn:oasis:names:tc:opendocument:xmlns:table:1.0')
        self.rev_nsOD = OrderedDict()

        # qname entries like:
        # ('{urn:oasis:names:tc:opendocument:xmlns:office:1.0}document-content',
        #  u'office:document-content')
        self.qnameOD = OrderedDict()

        def remember_qname(full_name):
            # Map "{uri}name" to "prefix:name"; names without a "{uri}" part
            # are not recorded.
            pieces = full_name.split('}')
            if len(pieces) == 2:
                uri = pieces[0][1:]  # drop the leading "{"
                self.qnameOD[full_name] = '%s:%s' % (self.nsOD[uri], pieces[1])

        events = ("start", "end", "start-ns", "end-ns")
        context = ET.iterparse(StringIO(xml_src), events=events)

        for event, elem in context:
            if event == "start":
                remember_qname(elem.tag)
                for qname, _value in elem.attrib.items():
                    remember_qname(qname)
            elif event == "start-ns":
                # For "start-ns" events elem is a (prefix, uri) pair.
                prefix, uri = elem[0], elem[1]
                self.nsOD[uri] = prefix
                self.rev_nsOD[prefix] = uri

        self.context = context
        self.root = context.root

        self.parentD = {}  # index=child Element object, value=parent Element object
        self.depthD = {}  # index=Element object, value = depth in xml tree
        # index=Element object, value=tuple of child position (e.g. (0,3,1))
        self.original_posD = {}

        self.max_depth = 0
        # index=Element, value = short name (like: "ns0:name1/ns1:xyz/ns3:abc")
        self.short_pathD = {}

        # After building tree, seed the lookup tables with the root element.
        self.parentD[self.root] = None
        self.depthD[self.root] = 0
        self.short_pathD[self.root] = self.qnameOD[self.root.tag]  # just = qname
        self.original_posD[self.root] = (0, )  # tuple of position

        # Counts occurrences of each (parent, short path) pair; only used here.
        temp_short_path_counterD = {}
        self.short_path_counterD = {}  # index=Element, value=short path counter value
        # index=Element, value=parent's short path counter value
        self.short_path_parent_counterD = {}
        self.short_path_counterD[self.root] = 1  # 1st (and only) occurrence
        self.short_path_parent_counterD[self.root] = 1  # 1st (and only) occurrence

        for parent in self.root.iter():
            try:
                # Iterate the element directly: Element.getchildren() was
                # removed from the standard library in Python 3.9.
                for ichild, child in enumerate(parent):
                    self.parentD[child] = parent
                    self.depthD[child] = self.depthD[parent] + 1
                    self.max_depth = max(self.max_depth, self.depthD[child])

                    self.original_posD[child] = self.original_posD[parent] + (ichild, )

                    short_path = self.get_short_path(child)
                    self.short_pathD[child] = short_path

                    counter_key = (parent, short_path)
                    occurrence = temp_short_path_counterD.get(counter_key, 0) + 1
                    temp_short_path_counterD[counter_key] = occurrence

                    parent_counter = self.short_path_counterD[parent]
                    self.short_path_counterD[child] = '%s,%s' % (parent_counter,
                                                                 occurrence)
                    self.short_path_parent_counterD[child] = '%s' % parent_counter
            except Exception:
                # Best-effort, as before: a failure while indexing one
                # parent's children is reported and the remaining parents are
                # still processed.  Narrowed from a bare "except:" so that
                # KeyboardInterrupt and SystemExit are no longer swallowed.
                print('NOTICE: No children for:', parent)

        # reverse lookup of "original_posD": position tuple -> Element
        self.get_elem_from_orig_posD = {}
        for key, item in self.original_posD.items():
            self.get_elem_from_orig_posD[item] = key

        # set up dictionaries to hold style:name info (if init_all_annn_style8name)
        self.annn_style8nameD = {}  # index=style:name ("a123"), value=style elem
        # index=xxxxx:style-name (e.g. "a123"), value=elem with ref (not style itself)
        self.style_refD = {}
        self.id_draw8idD = {}  # index=draw:id (e.g. "id123"), value=elem
Пример #2
0
    def __init__(self, xml_file_name_or_src):
        """
        Read and parse an xml document, then index its element tree.

        Parsing goes through ``ET.iterparse`` (a modified version of the
        standard python xml.etree.ElementTree).

        xml_file_name_or_src can be a file name like: "content.xml" OR
        can be xml source.  It is treated as a file name (and read from disk
        as UTF-8) only when it ends with ".xml" and is shorter than 256
        characters; otherwise it is parsed as xml source text.

        Raises KeyError if a namespaced tag or attribute refers to a
        namespace URI that has not been announced by a "start-ns" event, or
        if the root tag has no entry in self.qnameOD.
        """
        if xml_file_name_or_src.endswith('.xml') and len(xml_file_name_or_src) < 256:
            self.xml_file_name_or_src = xml_file_name_or_src

            # The context manager closes the file even if read() raises.
            with io.open(xml_file_name_or_src, 'rt', encoding='utf-8') as fInp:
                xml_src = fInp.read()
        else:
            self.xml_file_name_or_src = None
            xml_src = xml_file_name_or_src

        self.xml_header = ''  # Assume no header unless found at head of file
        match = header_re.match(xml_src)
        if match:
            self.xml_header = match.group(0)  # will need \n when serialized

        # ns entries like: ('urn:oasis:names:tc:opendocument:xmlns:table:1.0', u'table')
        self.nsOD = OrderedDict()

        # rev_ns entries like: (u'table', 'urn:oasis:names:tc:opendocument:xmlns:table:1.0')
        self.rev_nsOD = OrderedDict()

        # qname entries like:
        # ('{urn:oasis:names:tc:opendocument:xmlns:office:1.0}document-content',
        #  u'office:document-content')
        self.qnameOD = OrderedDict()

        def remember_qname(full_name):
            # Map "{uri}name" to "prefix:name"; names without a "{uri}" part
            # are not recorded.
            pieces = full_name.split('}')
            if len(pieces) == 2:
                uri = pieces[0][1:]  # drop the leading "{"
                self.qnameOD[full_name] = '%s:%s' % (self.nsOD[uri], pieces[1])

        events = ("start", "end", "start-ns", "end-ns")
        context = ET.iterparse(StringIO(xml_src), events=events)

        for event, elem in context:
            if event == "start":
                remember_qname(elem.tag)
                for qname, _value in elem.attrib.items():
                    remember_qname(qname)
            elif event == "start-ns":
                # For "start-ns" events elem is a (prefix, uri) pair.
                prefix, uri = elem[0], elem[1]
                self.nsOD[uri] = prefix
                self.rev_nsOD[prefix] = uri

        self.context = context
        self.root = context.root

        self.parentD = {}  # index=child Element object, value=parent Element object
        self.depthD = {}  # index=Element object, value = depth in xml tree
        # index=Element object, value=tuple of child position (e.g. (0,3,1))
        self.original_posD = {}

        self.max_depth = 0
        # index=Element, value = short name (like: "ns0:name1/ns1:xyz/ns3:abc")
        self.short_pathD = {}

        # After building tree, seed the lookup tables with the root element.
        self.parentD[self.root] = None
        self.depthD[self.root] = 0
        self.short_pathD[self.root] = self.qnameOD[self.root.tag]  # just = qname
        self.original_posD[self.root] = (0,)  # tuple of position

        # Counts occurrences of each (parent, short path) pair; only used here.
        temp_short_path_counterD = {}
        self.short_path_counterD = {}  # index=Element, value=short path counter value
        # index=Element, value=parent's short path counter value
        self.short_path_parent_counterD = {}
        self.short_path_counterD[self.root] = 1  # 1st (and only) occurrence
        self.short_path_parent_counterD[self.root] = 1  # 1st (and only) occurrence

        for parent in self.root.iter():
            try:
                # Iterate the element directly: Element.getchildren() was
                # removed from the standard library in Python 3.9.
                for ichild, child in enumerate(parent):
                    self.parentD[child] = parent
                    self.depthD[child] = self.depthD[parent] + 1
                    self.max_depth = max(self.max_depth, self.depthD[child])

                    self.original_posD[child] = self.original_posD[parent] + (ichild,)

                    short_path = self.get_short_path(child)
                    self.short_pathD[child] = short_path

                    counter_key = (parent, short_path)
                    occurrence = temp_short_path_counterD.get(counter_key, 0) + 1
                    temp_short_path_counterD[counter_key] = occurrence

                    parent_counter = self.short_path_counterD[parent]
                    self.short_path_counterD[child] = '%s,%s' % (parent_counter,
                                                                 occurrence)
                    self.short_path_parent_counterD[child] = '%s' % parent_counter
            except Exception:
                # Best-effort, as before: a failure while indexing one
                # parent's children is reported and the remaining parents are
                # still processed.  Narrowed from a bare "except:" so that
                # KeyboardInterrupt and SystemExit are no longer swallowed.
                print('NOTICE: No children for:', parent)

        # reverse lookup of "original_posD": position tuple -> Element
        self.get_elem_from_orig_posD = {}
        for key, item in self.original_posD.items():
            self.get_elem_from_orig_posD[item] = key

        # set up dictionaries to hold style:name info (if init_all_annn_style8name)
        self.annn_style8nameD = {}  # index=style:name ("a123"), value=style elem
        # index=xxxxx:style-name (e.g. "a123"), value=elem with ref (not style itself)
        self.style_refD = {}
        self.id_draw8idD = {}  # index=draw:id (e.g. "id123"), value=elem