def invoke(self, packet):
    """Start (or continue) parsing the XML file named by ``packet.data``.

    If ``packet.data`` is ``None`` the call is a no-op apart from a log
    message, and ``packet`` is returned untouched.  Otherwise:

    * ``packet.data`` is remembered as ``self.cur_file_path`` -- but only
      when no path is stored yet; a path that is already set is kept and
      the new value is ignored.
    * ``packet.data`` is cleared.
    * When no parse context exists, the file is opened, ``self.elem_count``
      is reset to 0 and an incremental ``start``/``end`` parser is created.
      Its first event is consumed and that element is stored as
      ``self.root``.
    * The packet is handed to ``self.process_xml`` and whatever that
      returns is returned to the caller.

    The file handle is intentionally left open: the parser stored in
    ``self.context`` keeps reading from it after this method returns.
    """
    if packet.data is None:
        log.info("No XML file given")
        return packet

    if self.cur_file_path is None:
        self.cur_file_path = packet.data
    packet.data = None

    if self.context is None:
        # Open file.  Binary mode hands the parser the raw bytes so that it
        # can honour the document's own encoding declaration.
        fd = open(self.cur_file_path, "rb")
        self.elem_count = 0
        log.info("file opened : %s", self.cur_file_path)
        self.context = iter(etree.iterparse(fd, events=("start", "end")))
        # The first event belongs to the document's first element; keep that
        # element as the tree root.  The built-in next() works on both
        # Python 2.6+ and Python 3, unlike the .next() method.
        _, self.root = next(self.context)

    return self.process_xml(packet)
def read(self, packet):
    """Pull the next parser event for the current XML file.

    ``packet.data`` is cleared first.  When no parse context exists, the
    next path is popped from the front of ``self.file_list``, the file is
    opened, ``self.elem_count`` is reset to 0 and an incremental
    ``start``/``end`` parser is created; its first event is consumed and
    that element is stored as ``self.root``.  If ``self.file_list`` is
    empty at that point, ``packet`` is returned as is.

    One further event is then read from the parser.  When the parser is
    exhausted, ``self.context`` is reset to ``None``.

    NOTE(review): the definition visible here stops after the event is
    read -- the event is not used and the method falls through, returning
    ``None`` implicitly.  It looks truncated, and a later ``read``
    definition in this file supersedes it; confirm before relying on it.
    """
    packet.data = None

    if self.context is None:
        if not self.file_list:
            # No more files left, all done
            log.info("No more files left")
            return packet

        # Files available: pop next file
        self.cur_file_path = self.file_list.pop(0)
        # Binary mode hands the parser the raw bytes so that it can honour
        # the document's own encoding declaration.
        fd = open(self.cur_file_path, "rb")
        self.elem_count = 0
        log.info("file opened : %s", self.cur_file_path)
        self.context = iter(etree.iterparse(fd, events=("start", "end")))
        # The first event belongs to the document's first element; keep that
        # element as the tree root.
        _, self.root = next(self.context)

    try:
        # Names kept for a continuation of this body that is not visible
        # here; they are unused as the method stands.
        event, elem = next(self.context)
    except StopIteration:
        # Parser exhausted: drop the context so the next call opens the
        # following file.
        self.context = None
def read(self, packet):
    """Advance the XML stream by one parser event and return ``packet``.

    ``packet.data`` is cleared on entry and is only filled again when the
    event read is the ``end`` of an element whose (namespace-stripped) tag
    is listed in ``self.element_tags``.

    Steps:

    1. When no parse context exists, pop the next path from the front of
       ``self.file_list``, open it, reset ``self.elem_count`` and create an
       incremental ``start``/``end`` parser.  Its first event is consumed
       and that element is stored as ``self.root``.  If ``self.file_list``
       is empty, log and return ``packet`` unchanged (``data`` is ``None``).
    2. Read one event.  If the parser is exhausted (or raises
       ``etree.XMLSyntaxError``, see the workaround note below) the document
       is finished: the source file is closed, ``packet.set_end_of_doc()``
       is called and, when no files remain, ``packet.set_end_of_stream()``
       as well.
    3. Otherwise, on the ``end`` event of a wanted element, store it in
       ``packet.data``, increment ``self.elem_count`` and detach it from the
       tree.  When ``self.strip_namespaces`` is set, ``packet.data`` is
       replaced by ``Util.stripNamespaces(elem).getroot()``.

    The open file handle is kept in ``self._source_fd`` between calls so it
    can be closed explicitly at end of document.
    """
    packet.data = None

    if self.context is None:
        if not self.file_list:
            # No more files left, all done
            log.info("No more files left")
            return packet

        # Files available: pop next file
        self.cur_file_path = self.file_list.pop(0)
        # Binary mode hands the parser the raw bytes so that it can honour
        # the document's own encoding declaration.
        fd = open(self.cur_file_path, "rb")
        self._source_fd = fd
        self.elem_count = 0
        log.info("file opened : %s", self.cur_file_path)
        self.context = iter(etree.iterparse(fd, events=("start", "end")))
        # The first event belongs to the document's first element; keep that
        # element as the tree root.  The built-in next() works on both
        # Python 2.6+ and Python 3, unlike the .next() method.
        _, self.root = next(self.context)

    try:
        event, elem = next(self.context)
    except (etree.XMLSyntaxError, StopIteration):
        # workaround for etree.XMLSyntaxError
        # https://bugs.launchpad.net/lxml/+bug/1185701
        self.context = None

    if self.context is None:
        # Always end of doc.  Close the source explicitly instead of relying
        # on garbage collection; getattr() because nothing visible here
        # guarantees the attribute was initialised elsewhere.
        source = getattr(self, "_source_fd", None)
        if source is not None:
            source.close()
            self._source_fd = None

        packet.set_end_of_doc()
        log.info("End of doc: %s elem_count=%d",
                 self.cur_file_path, self.elem_count)

        # Maybe end of stream (all docs done)
        if not self.file_list:
            # No more files left: end of stream
            packet.set_end_of_stream()
            log.info("End of stream")
        return packet

    # Filter out the namespace from the tag ("{uri}name" -> "name").
    # This is the easiest way to go for now.
    parts = elem.tag.split("}")
    tag = parts[1] if len(parts) == 2 else parts[0]

    # Only a completed ("end") element is emitted; its "start" event is
    # deliberately ignored.
    if tag in self.element_tags and event == "end":
        packet.data = elem
        self.elem_count += 1

        # Detach the element from the tree.  Going through the element's
        # actual parent (lxml's getparent()) instead of self.root avoids a
        # ValueError when the match is nested deeper than the root's direct
        # children, or is the root element itself (which has no parent).
        parent = elem.getparent()
        if parent is not None:
            parent.remove(elem)

        if self.strip_namespaces:
            packet.data = Util.stripNamespaces(elem).getroot()

    return packet