def process(fs, xmldata):
    """
    @param fs: a fieldstorage-like object
    @param xmldata: contents of an xml file as a multiline string
    @return: contents of the new xml file as a multiline string
    """
    # get user input
    start_pos = fs.start_pos
    stop_pos = fs.stop_pos
    nsteps = fs.nsteps
    alignment_id = fs.alignment_id
    # read the xml tree
    tree = etree.parse(StringIO(xmldata))
    # modify the number of mcmc steps
    for event, element in etree.iterwalk(tree, tag='mcmc'):
        if element.get('id') == fs.mcmc_id:
            element.set('chainLength', str(fs.nsteps))
    # modify the sequences within the alignment
    for event, element in etree.iterwalk(tree, tag='alignment'):
        if element.get('id') == fs.alignment_id:
            for seq_element in element:
                if seq_element.tag != 'sequence':
                    continue
                for taxon_element in seq_element:
                    if taxon_element.tag != 'taxon':
                        continue
                    modify_taxon_sequence(taxon_element, start_pos, stop_pos)
    # write the xml tree
    return etree.tostring(tree)
def __init__(self, xml):
    """ DOKU """
    einsatz_id = 0
    try:
        xml_root = etree.fromstring(xml)
        context = etree.iterwalk(xml_root, events=("start",))
        # Walk over every element of the XML file
        for action, elem in context:
            if elem.tag != "root":
                if elem.tag == "einsatz":
                    einsatzobj = EinsatzKlasse()
                    einsatz_id = elem.get("id")
                    setattr(einsatzobj, elem.tag, elem.get("id"))
                elif elem.tag == "einsatznr":
                    setattr(einsatzobj, elem.tag, elem.text)
                    einsatzobj.save()
                elif elem.tag == "dispo":
                    dispoobj = DispoKlasse()
                    setattr(dispoobj, "einsatz", einsatz_id)
                    setattr(dispoobj, elem.tag, elem.get("id"))
                elif elem.tag in ("disponame", "zeitdispo", "zeitalarm", "zeitaus", "zeitein"):
                    setattr(dispoobj, elem.tag, elem.text)
                elif elem.tag == "hintergrund":
                    setattr(dispoobj, elem.tag, elem.text)
                    dispoobj.save()
                elif elem.text:
                    setattr(einsatzobj, elem.tag, elem.text)
        # Find operations that have not been closed yet and close them if necessary
        unabgeschl = EinsatzModel.objects.filter(abgeschlossen=False, selbst_erstellt=False)
        # Walk over every open operation
        for unab in unabgeschl:
            close = 0
            xml_root = etree.fromstring(xml)
            context = etree.iterwalk(xml_root, events=("start",))
            for action, elem in context:
                if elem.tag == "einsatz":
                    if elem.get("id") == unab.einsatz:
                        close = 1
            # If an open operation is not present in the XML file, close it
            if close == 0:
                closeeins = EinsatzKlasse()
                closeeins.closeeinsatz(unab.einsatz)
    except:
        pass
def to_xml(self, path, *args, **kwargs):
    """Write xml file to path (adds .xml extension if none provided)"""
    if len(args) == 0:
        schema = Root()
    else:
        schema = Section(*args)
    entries = self.get_content(schema)
    xml = self.content_to_xml(schema.to_xml(), entries)
    # Bilingual fields are structured extremely weirdly and have to be post-processed
    for _, field_xml in etree.iterwalk(xml, tag="field"):
        child = list(field_xml)[0]
        if child.get("type") == "Bilingual":
            english = child[0].text
            french = child[1].text
            child.remove(child[1])
            child.remove(child[0])
            child.text = english
            field_xml.append(etree.Element("bilingual"))
            field_xml[1].append(etree.Element("french"))
            field_xml[1][0].text = french
            field_xml[1].append(etree.Element("english"))
            field_xml[1][1].text = english
    if not path.endswith(".xml"):
        path = path + ".xml"
    f = open(path, 'wb')
    with f:
        f.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
        f.write(etree.tostring(xml, **kwargs))
def __init__(self, xml_path=None, language="english"):
    self._index = {}
    self._content = {}
    self.language = language
    self.log = logging.getLogger("CCV")
    if xml_path is not None:
        f = open(xml_path, 'rb')
        with f:
            content = f.read()
        xml = etree.XML(content)
        msg = '# Importing existing entries from "%s" #'
        self.log.info(msg, xml_path)
        # Re-mapping existing sections according to specified schema
        for _, section_xml in etree.iterwalk(xml, tag="section"):
            # If any parents have fields, do not move
            section = XML(section_xml, language)
            section = Section(section.id)
            if section.is_container:
                self.get_container(section)
            elif not section.is_dependent:
                self.add_content(self.parse_xml(section_xml), section)
        msg = '# Finished importing #'
        self.log.info(msg)
def render_xml(path, template_name, remove_empty, **nfe):
    nfe = recursively_normalize(nfe)
    env = Environment(
        loader=FileSystemLoader(path), extensions=['jinja2.ext.with_'])
    env.filters["normalize"] = filters.strip_line_feed
    env.filters["normalize_str"] = filters.normalize_str
    env.filters["format_percent"] = filters.format_percent
    env.filters["format_datetime"] = filters.format_datetime
    env.filters["format_date"] = filters.format_date
    env.filters["comma"] = filters.format_with_comma
    template = env.get_template(template_name)
    xml = template.render(**nfe)
    parser = etree.XMLParser(remove_blank_text=True, remove_comments=True,
                             strip_cdata=False)
    root = etree.fromstring(xml, parser=parser)
    if remove_empty:
        context = etree.iterwalk(root)
        for dummy, elem in context:
            parent = elem.getparent()
            if recursively_empty(elem):
                parent.remove(elem)
        return root
    return etree.tostring(root)
def render_xml(path, template_name, remove_empty, **nfe):
    env = Environment(loader=FileSystemLoader(path),
                      extensions=['jinja2.ext.with_'])
    env.filters["normalize"] = filters.strip_line_feed
    env.filters["normalize_str"] = filters.normalize_str
    env.filters["format_percent"] = filters.format_percent
    env.filters["format_datetime"] = filters.format_datetime
    env.filters["format_date"] = filters.format_date
    template = env.get_template(template_name)
    xml = template.render(**nfe)
    parser = etree.XMLParser(remove_blank_text=True, remove_comments=True,
                             strip_cdata=False)
    root = etree.fromstring(xml, parser=parser)
    if remove_empty:
        context = etree.iterwalk(root)
        for dummy, elem in context:
            parent = elem.getparent()
            if recursively_empty(elem):
                parent.remove(elem)
        return root
    return etree.tostring(root)
def salesforce_encoding(xdoc):
    r = xml_encoding
    if SF_NS in xdoc.getroot().tag:
        xdoc.getroot().attrib["xmlns"] = SF_NS
    for action, elem in ET.iterwalk(
            xdoc, events=("start", "end", "start-ns", "end-ns", "comment")):
        if action == "start-ns":
            pass  # handle this nicely if SF starts using multiple namespaces
        elif action == "start":
            tag = elem.tag
            if "}" in tag:
                tag = tag.split("}")[1]
            text = (escape(elem.text, {"'": "&apos;", '"': "&quot;"})
                    if elem.text is not None else "")
            attrs = "".join([f' {k}="{v}"' for k, v in elem.attrib.items()])
            if not has_content(elem):
                r += f"<{tag}{attrs}/>"
            else:
                r += f"<{tag}{attrs}>{text}"
        elif action == "end" and has_content(elem):
            tag = elem.tag
            if "}" in tag:
                tag = tag.split("}")[1]
            tail = elem.tail if elem.tail else "\n"
            r += f"</{tag}>{tail}"
        elif action == "comment":
            r += str(elem) + (elem.tail if elem.tail else "")
    return r
def clearEmptyElements(self, root):
    """
    Remove root's empty children.
    """
    context = ET.iterwalk(root)
    for _, elem in context:
        parent = elem.getparent()
        if parent is not None and self.isRecursivelyEmpty(elem):
            parent.remove(elem)
def xml2simpleconfig(flt_ctxt, in_obj):
    """Mapping (almost bijective) XML -> simpleconfig

    Inverse mapping cannot generally be lossless, as XML cannot contain
    repeated attributes, which seems/is valid with simpleconfig.

    See `simpleconfig` docstring for details about the target representation.
    """
    # using similar trick of stack emulation in-place as command.analyse_chain,
    # but this is iterative rather than recursive :)
    root = []
    for action, e in etree.iterwalk(in_obj('etree'), events=('start', 'end')):
        if action == 'start':
            #print(">>> start", e.tag, root)
            current = [e.tag, None, None]
            root.append(current)
            current[1] = tuple(sorted(iter_items(e.attrib)))
            #print("<<< start", e.tag, root)
        elif action == 'end':
            #print(">>> end", e.tag, root)
            if len(root) == 1:
                #assert id(e) == id(root)
                break
            current = root.pop()
            if root[-1][2] is None:
                root[-1][2] = []
            root[-1][2].append(current)
            #print("<<< end", e.tag, root)
    return ('struct', root[-1])
def get_email(self):
    def get_node_content(element):
        return (element.text + "".join(map(etree.tostring, element))).strip()

    subject, body = None, None
    for _, element in etree.iterwalk(self.template, tag="block"):
        if element.get("onstate", None) == self.status:
            subject = get_node_content(element.find("subject"))
            body = get_node_content(element.find("body"))
        elif ((element.get("afterstate", None) == self.status) and
              (element.get("time", None) == self.time_string)):
            subject = get_node_content(element.find("subject"))
            body = get_node_content(element.find("body"))
        element.clear()
    # get defaults
    if not subject:
        subject = "Item Notification: %s" % (self.message["title"])
        log.warn("Missing subject template for %s:%s. Using default"
                 % (self.message["type"], self.message["status"]))
    if not body:
        body = "Item, %s, type: %s, status: %s" % (
            self.message["title"], self.message["type"],
            self.message["status"]["value"])
        log.warn("Missing body template for %s:%s. Using default"
                 % (self.message["type"], self.message["status"]))
    subject_template = EmailTemplate()
    body_template = EmailTemplate()
    subject_template.write(subject)
    body_template.write(body)
    return (subject_template(item=Item(self.message)),
            body_template(item=Item(self.message)))
def create_svg(self, template, file_out, bone):
    """ """
    colors = {}
    colors[0] = 'fill:#ffffff'
    colors[1] = 'fill:#e1e1e1'
    colors[2] = 'fill:#808080'
    colors[3] = 'fill:#4b4b4b'
    colors[4] = 'fill:#000000'
    doc = etree.parse('svg/' + template)
    for action, el in etree.iterwalk(doc):
        id = el.attrib.get('id')
        if id is not None:
            id = self.clear_id(id)
            if id in bone:
                attributes = el.attrib
                if bone[id] is not None and bone[id] >= 0:
                    attributes["style"] = colors[bone[id]] + ";fill-opacity:1"
                    if len(el) > 0:
                        for item in el:
                            item_attr = item.attrib
                            styl = item_attr["style"]
                            if "fill:#ffffff" in styl:
                                styl = styl.replace('fill:#ffffff', colors[bone[id]])
                                item_attr["style"] = styl
    with open(file_out + '.svg', 'w') as f:
        f.write('<?xml version="1.0" encoding="UTF-8"?>\n'
                + etree.tostring(doc, pretty_print=True).decode('utf-8'))
    drawing = svg2rlg(file_out + '.svg')
def check(self):
    name_contest_id = {}  # Mapping for <Name> and its Contest ObjectId.
    error_log = []
    for event, element in etree.iterwalk(self.election_tree):
        tag = self.strip_schema_ns(element)
        if tag != "Contest":
            continue
        object_id = element.get("objectId", None)
        name = element.find("Name")
        if name is None or not name.text:
            error_message = "Contest {0} is missing a <Name> ".format(object_id)
            error_log.append(base.ErrorLogEntry(element.sourceline, error_message))
            continue
        # Add each name as a key mapped to the list of its objectIds;
        # ideally one objectId, more if duplicates are found.
        name_contest_id.setdefault(name.text, []).append(object_id)
    for name, contests in name_contest_id.items():
        if len(contests) > 1:
            error_message = (
                "Contest name '{0}' appears in following {1} contests: {2}".format(
                    name, len(contests), ", ".join(contests)))
            error_log.append(base.ErrorLogEntry(None, error_message))
    if error_log:
        raise base.ElectionTreeError(
            "The Election File contains duplicate contest names.", error_log)
def main():
    xml_input = sys.argv[1]   # build/gtestresults.xml
    xsl_templ = sys.argv[2]   # sonar_test.xsl
    xmldoc = None
    transform = None
    with open(xml_input) as f:
        xmldoc = etree.parse(f)
    with open(xsl_templ) as xsltfile:
        xslt_root = etree.XML(xsltfile.read())
        transform = etree.XSLT(xslt_root)
    result_tree = transform(xmldoc)
    #print(result_tree)
    context = etree.iterwalk(result_tree, tag="file")
    for action, elem in context:
        testcase = elem[0]
        classname = testcase.attrib["classname"]
        casename = testcase.attrib["name"]
        real_file_path = get_file_path(classname, casename)
        if len(real_file_path) > 0:
            elem.attrib["path"] = real_file_path
    print(result_tree)
def to_pahdb_dict(self, validate=False):
    """Parses the XML, with or without validation.

    Args:
        validate (bool). Defaults to self.validate value, but can be
            overridden.

    Note:
        Sets the attribute self.library when successful.

    Returns:
        library (dict): Dictionary, with the UIDs as keys, containing the
            transitions, geometry data, as well as UID metadata, references,
            comments, and laboratory.
    """
    if self.validate or validate:
        self.verify_schema()
        self._context = etree.iterwalk(self._tree, events=("start", "end"))
    else:
        self._context = etree.iterparse(self.filename, events=("start", "end"))
    self.library = self._tree_to_pahdb_dict()
    return self.library
def check_rules(self):
    """Checks all rules."""
    try:
        self.election_tree = etree.parse(self.election_file)
    except etree.LxmlError as e:
        print("Fatal Error. XML file could not be parsed. {}".format(e))
        self.exception_counts[ElectionError] += 1
        self.total_count += 1
        return
    self.register_rules()
    for rule in self.registry.get("tree", []):
        try:
            rule.check()
        except ElectionException as e:
            self.exception_handler(rule, e)
    for _, element in etree.iterwalk(self.election_tree, events=("end",)):
        tag = self.get_element_class(element)
        if not tag or tag not in self.registry:
            continue
        for element_rule in self.registry[tag]:
            try:
                element_rule.check(element)
            except ElectionException as e:
                self.exception_handler(element_rule, e)
def iter_types(ctx: Context, html: bytes) -> Iterator[ApiType]:
    doc = etree.iterwalk(
        etree.HTML(html),
        events=("start",),
        tag=("h4"),
    )
    sign = ApiTypeSignature()
    for (action, elem) in doc:
        if sign.consume(elem) != SIGN_NEXT:
            continue
        for sibl in elem.itersiblings():
            resp = sign.consume(sibl)
            if resp == SIGN_MATCHED:
                ty = extract_type(ctx, sign)
                ctx.types_repository[ty.name] = ty
                sign.clear()
                break
            if resp == SIGN_STOP:
                break
    return ctx.types_repository.values()
def text(self):
    # TODO: this blob won't allow exact phrase matches across transcript pages.
    # It might be extended a few words into either adjacent page to allow that.
    text = ''
    for event, element in etree.iterwalk(self.xml_tree(), events=('start', 'end')):
        if element.tag == 'p':
            if len(element) and element[0].tag == 'runningHead':
                continue
            if event == 'start':
                if element.text:
                    if len(element.text) < 20 and TranscriptPageJoiner.ignore_p.match(element.text):
                        continue
                    text += element.text
            else:
                text += '\n\n'
        elif event == 'end' and element.tag == 'spkr':
            if element.text:
                text += '<span class="speaker">{}</span> '.format(element.text)
            if element.tail:
                text += element.tail
        elif event == 'end' and element.tag in ('evidenceFileDoc', 'exhibitDocDef', 'exhibitDocPros'):
            if element.text:
                text += element.text
            if element.tail:
                text += element.tail
    return text
def parseMethod(method_elem, nt):
    instance_starts = []
    method_instances = []
    instance_depths = []
    prev_action = 'start'
    curr_depth = 0
    context = etree.iterwalk(method_elem, events=('start', 'end'), tag=srcml_tags[nt])
    for action, elem in context:
        if action == 'start':
            instance_starts.append(elem.sourceline - 1)
            method_instances.append(elem)
            if prev_action == 'start':
                curr_depth += 1
            instance_depths.append(curr_depth)
        elif action == 'end':
            if prev_action == 'end':
                curr_depth -= 1
        prev_action = action
    return (method_instances, instance_starts, instance_depths)
def _find_tag_limits(root):
    START_RE = re.compile(r' __START_(\w+)__ ')
    END_RE = re.compile(r' __END_(\w+)__ ')
    starts = list()
    ends = list()
    for _, element in etree.iterwalk(root, events=('start',)):
        tasks = [(element.text, START_RE, starts, False),
                 (element.text, END_RE, ends, False),
                 (element.tail, START_RE, starts, True),
                 (element.tail, END_RE, ends, True)]
        for text, regexp, storage, is_tail in tasks:
            if not text:
                continue
            for match in regexp.finditer(text):
                if not match:
                    continue
                storage.append(
                    _TagPosition(element=element,
                                 tag=match.group(1),
                                 position=match.start(),
                                 length=match.end() - match.start(),
                                 is_tail=is_tail,
                                 dfs_number=-1))
    return starts, ends
def render_xml(path, template_name, remove_empty, **nfe):
    nfe = recursively_normalize(nfe)
    env = Environment(loader=FileSystemLoader(path),
                      extensions=['jinja2.ext.with_'])
    env.filters["normalize"] = filters.strip_line_feed
    env.filters["normalize_str"] = filters.normalize_str
    env.filters["format_percent"] = filters.format_percent
    env.filters["format_datetime"] = filters.format_datetime
    env.filters["format_date"] = filters.format_date
    env.filters["comma"] = filters.format_with_comma
    template = env.get_template(template_name)
    xml = template.render(**nfe)
    parser = etree.XMLParser(remove_blank_text=True, remove_comments=True,
                             strip_cdata=False)
    root = etree.fromstring(xml, parser=parser)
    if remove_empty:
        context = etree.iterwalk(root)
        for dummy, elem in context:
            parent = elem.getparent()
            if recursively_empty(elem):
                parent.remove(elem)
        return root
    for element in root.iter("*"):
        # remove whitespace-only text
        if element.text is not None and not element.text.strip():
            element.text = None
    return etree.tostring(root, encoding=str)
def html_to_template_text(unsafe_html, context=None):
    """
    Parse html and turn it into template text.
    """
    # TODO: factor out parsing/serializing
    safe_html = sanitize_intermediate(unsafe_html)
    top_level_elements = fragments_fromstring(safe_html)
    # put top level elements in container
    container = etree.Element('div')
    if top_level_elements and not hasattr(top_level_elements[0], 'tag'):
        container.text = top_level_elements.pop(0)
    container.extend(top_level_elements)
    tree = etree.iterwalk(container, events=('end',))
    # walk over all elements
    for action, elem in tree:
        if not elem.tag in tag_handlers:
            continue
        for handler in tag_handlers[elem.tag]:
            can_continue = handler(elem, context)
            if can_continue is False:
                break
    template_bits = [etree.tostring(elem, encoding='UTF-8')
                     for elem in container]
    return sanitize_final(''.join(
        tag_imports + [escape(container.text or '')] + template_bits))
def xml2nodes(xml_content: Union[str, bytes]):
    if isinstance(xml_content, str):
        xml_content = xml_content.encode("utf-8")
    root = etree.fromstring(xml_content)
    nodes = []
    for _, n in etree.iterwalk(root):
        attrib = dict(n.attrib)
        if "bounds" in attrib:
            bounds = re.findall(r"(\d+)", attrib.pop("bounds"))
            if len(bounds) != 4:
                continue
            lx, ly, rx, ry = map(int, bounds)
            attrib['size'] = (rx - lx, ry - ly)
        attrib.pop("index", None)
        ok = False
        for attrname in ("text", "resource-id", "content-desc"):
            if attrname in attrib:
                ok = True
                break
        if ok:
            items = []
            for k, v in sorted(attrib.items()):
                items.append(k + ":" + str(v))
            nodes.append('|'.join(items))
    return nodes
def check_rules(self):
    """Checks all rules."""
    try:
        self.schema_tree = etree.parse(self.schema_file)
        self.election_tree = etree.parse(self.election_file)
    except etree.LxmlError as e:
        exp = loggers.ElectionFatal.from_message(
            "Fatal Error. XML file could not be parsed. {}".format(e))
        self.exceptions_wrapper.exception_handler(exp)
        return
    self.register_rules()
    for rule in self.registry.get("tree", []):
        try:
            rule.check()
        except loggers.ElectionException as e:
            rule_name = rule.__class__.__name__
            self.exceptions_wrapper.exception_handler(e, rule_name)
    for _, element in etree.iterwalk(self.election_tree, events=("end",)):
        tag = self.get_element_class(element)
        if not tag or tag not in self.registry:
            continue
        for element_rule in self.registry[tag]:
            try:
                element_rule.check(element)
            except loggers.ElectionException as e:
                rule_name = element_rule.__class__.__name__
                self.exceptions_wrapper.exception_handler(e, rule_name)
def check_rules(self):
    """Checks all rules.

    Returns:
        0 if no warnings or errors are generated. 1 otherwise.

    Args:
        detailed: if True prints detailed error messages
    """
    try:
        election_tree = etree.parse(self.election_file)
    except etree.LxmlError as e:
        print("Fatal Error. XML file could not be parsed. {}".format(e))
        return 1
    self.register_rules(election_tree)
    for rule in self.registry.get("tree", []):
        try:
            rule.check()
        except ElectionException as e:
            self.exception_handler(rule, e)
    for event, element in etree.iterwalk(election_tree, events=("end",)):
        tag = self.get_element_class(element)
        if not tag or tag not in self.registry.keys():
            continue
        for element_rule in self.registry[tag]:
            try:
                element_rule.check(element)
            except ElectionException as e:
                self.exception_handler(element_rule, e)
    if self.total_count == 0:
        return 0
    else:
        return 1
def html_to_template_text(unsafe_html):
    """
    Parse html and turn it into template text.
    """
    safe_html = sanitize_intermediate(unsafe_html)
    top_level_elements = fragments_fromstring(safe_html)
    # put top level elements in container
    container = etree.Element('div')
    if top_level_elements and not hasattr(top_level_elements[0], 'tag'):
        container.text = top_level_elements.pop(0)
    container.extend(top_level_elements)
    context = etree.iterwalk(container, events=('end',))
    # walk over all elements
    for action, elem in context:
        if not elem.tag in tag_handlers:
            continue
        for handler in tag_handlers[elem.tag]:
            can_continue = handler(elem)
            if can_continue is False:
                break
    template_bits = [etree.tostring(elem, encoding='utf-8')
                     for elem in container]
    return sanitize_final(''.join(tag_imports + [container.text or ''] + template_bits))
def extract_rich_text_from_node(element, url):
    strs = []
    for action, node in etree.iterwalk(element, events=("start", "end")):
        if not isinstance(node.tag, basestring):
            continue
        if action == "start":
            if node.tag == "br":
                # new line
                strs.append("\n")
            elif node.tag == "img":
                # image: extract the image URL
                new_img_url = extract_image_from_node(node, url)
                if new_img_url is not None:
                    strs.append("\n")
                    strs.append('<img src="%s"/>' % new_img_url)
                    strs.append("\n")
            if node.tag != "script" and node.tag != "style" and node.text is not None:
                strs.append(node.text)
        if action == "end":
            if node.tag in TITLE_PARAGRAPH_TAGS:
                # a new paragraph
                strs.append("\n")
            if node.tag == "td":
                strs.append(" ")
            if node.tag == "tr":
                strs.append("\n")
            if node.tail is not None and len(node.tail.strip()) > 0:
                strs.append(node.tail)
    return strs
def to_wkt_list(doc):
    '''converts all geometries to Well Known Text format'''
    from lxml import etree

    def ring_coords_to_wkt(ring):
        '''converts LinearRing coordinates to WKT style coordinates'''
        return ((ring.coordinates.text.strip())
                .replace(' ', '@@').replace(',', ' ').replace('@@', ', '))

    ring_wkt_list = []
    context = etree.iterwalk(doc, events=("start",))
    for action, elem in context:
        if elem.tag in ['{http://www.opengis.net/kml/2.2}Polygon',
                        '{http://www.opengis.net/kml/2.2}MultiPolygon']:
            #print("%s: %s" % (action, elem.tag))
            if elem.tag == '{http://www.opengis.net/kml/2.2}Polygon':
                # outer boundary
                ringlist = ['({0})'.format(
                    ring_coords_to_wkt(elem.outerBoundaryIs.LinearRing))]
                for obj in elem.findall('{http://www.opengis.net/kml/2.2}innerBoundaryIs'):
                    ringlist.append('({0})'.format(ring_coords_to_wkt(obj.LinearRing)))
                wkt = 'POLYGON ({rings})'.format(rings=', '.join(ringlist))
                ring_wkt_list.append(wkt)
    return (ring_wkt_list)
def _merge_previous(snippet, hooks, elem, children):
    # snippet, an original preprocessed "piece of template puzzle",
    # has some of its subelements substituted as per hooks that
    # together with elem traversal and children dict decides which
    # parts (of previously proceeded symbols) will be grabbed
    scheduled = OrderedDict()  # XXX to keep the law and order
    for _, c_elem in etree.iterwalk(elem, events=('start',)):
        if c_elem is elem:
            continue
        if c_elem in children:
            c_up = c_elem
            while not c_up.tag in hooks and c_up.getparent() != elem:
                c_up = c_up.getparent()
            target_tag = c_up.tag if c_up.tag in hooks else '*'
            if c_up.tag in hooks or '*' in hooks:
                for h in hooks[target_tag]:
                    l = scheduled.setdefault(h, [])
                    l.append(children[c_elem].getroot())
    for (index_history, mix), substitutes in iter_items(scheduled):
        tag = reduce(lambda x, y: x[y], index_history, snippet)
        parent = tag.getparent()
        index = parent.index(tag)
        for s in substitutes:
            #assert s.tag == namespaced(CLUFTER_NS, 'snippet')
            log.debug("before extension: {0}".format(etree.tostring(s)))
            if s.tag == namespaced(CLUFTER_NS, 'snippet'):
                # only single root "detached" supported (first == last)
                dst = parent
                # cannot use dict.update(dict) because of losing order
                for k in s.attrib:
                    dst.attrib[k] = s.attrib[k]
                #dst[index:index] = s
                tag.extend(s)
            elif mix:
                tag.extend(s)
            else:
                # required by obfuscate
                tag.append(s)
            log.debug("as extended contains: {0}".format(etree.tostring(tag)))
        at = tag.attrib.get('at', '*')
        if mix == 1 and at != '*':  #and elem.getparent() is None:
            e = nselem(XSL_NS, 'apply-templates', select=".//{0}".format(at))
            tag.append(e)
        elif mix == 2:
            e = nselem(XSL_NS, 'copy')
            e.append(nselem(XSL_NS, 'apply-templates', select="@*|node()"))
            tag.append(e)
    cl = snippet.xpath("//clufter:descent|//clufter:descent-mix",
                       namespaces={'clufter': CLUFTER_NS})
    # remove these remnants so cleanup_namespaces works well
    for e in cl:
        parent = e.getparent()
        index = parent.index(e)
        parent[index:index] = e.getchildren()
        e.getparent().remove(e)
def getListOfElementsByTagName(self, name):
    listOfElements = list()
    for _, elem in etree.iterwalk(self.xmlTree, events=('end',)):
        if name == BhpDocumentParser.stripNSFromTagName(elem.tag):
            listOfElements.append(elem)
    if len(listOfElements) != 0:
        return listOfElements
    return None
def extract_exhibit_codes(self):
    codes = []
    for event, element in etree.iterwalk(self.xml_tree()):
        if element.tag == 'exhibitDocPros':
            codes.append('Prosecution {}'.format(element.get('n')))
        elif element.tag == 'exhibitDocDef':
            codes.append('{} {}'.format(element.get('def') or 'Unknown Defendant',
                                        element.get('n')))
    return codes
def parse_doc(self, field, wrapper=None):
    """Use for retrieving document-level (as opposed to job-level) tags."""
    for event, element in etree.iterwalk(self.doc):
        if element.tag == field:
            if wrapper:
                return wrapper(element.text)
            else:
                return element.text
def text(self, page=None, merge_verticals=False):
    """Get the text items.

    If `page` is supplied, it should be a Page (as returned by self.pages()).
    If `merge_verticals` is supplied, the vertical offsets of items will be
    adjusted such that pages following the first appear immediately after the
    preceding page.
    """
    def text_for_page(page):
        for event, text in etree.iterwalk(page.element, tag='text'):
            fontid = text.attrib['font']
            yield Text(text, page, self.fontspec(text.attrib['font']))

    if page is None:
        if merge_verticals:
            offset = 0
            for event, page in etree.iterwalk(self.tree, tag='page'):
                page = Page(page)
                items = list(text_for_page(page))
                if len(items) == 0:
                    continue
                ymin = 9999999999999999
                ymax = 0
                for item in items:
                    if item.text.strip() == '':
                        continue
                    ymin = min(ymin, item.top)
                    ymax = max(ymax, item.bottom)
                for item in items:
                    if item.text.strip() == '':
                        continue
                    item.yoffset = offset - ymin
                    yield item
                offset += (ymax - ymin)
        else:
            for event, page in etree.iterwalk(self.tree, tag='page'):
                page = Page(page)
                for item in text_for_page(page):
                    yield item
    else:
        for item in text_for_page(page):
            yield item
def __findFeatures__(self, ifdef):
    result = ""
    context = etree.iterwalk(ifdef)
    for action, elem in context:
        if action == "end":
            if elem.tag.split('}')[1] == "name":
                result = result + elem.text
    return result
def elements(self):
    schema_tree = etree.parse(self.schema_file)
    eligible_elements = []
    for event, element in etree.iterwalk(schema_tree):
        tag = self.strip_schema_ns(element)
        if tag and tag == "element" and element.get("minOccurs") == "0":
            eligible_elements.append(element.get("name"))
    return eligible_elements
def parseWards(self):
    # Find the map element with the ID "map"
    mapNode = self.dom.get_element_by_id('map')
    for action, elem in etree.iterwalk(mapNode, tag='area'):
        print "Downloading %s" % elem.get('alt')
        self.wards.append(elem.get('alt'))
        CouncillorInfo(elem.get('href'))
def remove_empty(self, tree):
    # nodes that are recursively empty
    context = et.iterwalk(tree)
    for action, node in context:
        parent = node.getparent()
        if self._recursively_empty(node):
            parent.remove(node)
    return tree
def filter_xml(self, filename):
    pruned = 0
    try:
        tree = etree.parse(filename)
        path = []
        context = etree.iterwalk(tree, events=('start', 'end'))
        for action, elem in context:
            if elem.tag == 'object':
                if action == 'start':
                    path.append(elem.attrib.get('id'))
                elif action == 'end':
                    obj_path = '/'.join(path)
                    try:
                        obj = self.dmd.getObjByPath(obj_path)
                        if getattr(obj, 'zpl_managed', False):
                            self.LOG.debug("Removing {} from {}".format(obj_path, filename))
                            pruned += 1
                            # if there's a comment before it with the
                            # primary path of the object, remove that first.
                            prev = elem.getprevious()
                            if '<!-- ' + repr(tuple('/'.join(path).split('/'))) + ' -->' == repr(prev):
                                elem.getparent().remove(prev)
                            # Remove the ZPL-managed object
                            elem.getparent().remove(elem)
                    except Exception:
                        self.LOG.warning("Unable to postprocess {} in {}".format(obj_path, filename))
                    path.pop()
            if elem.tag == 'tomanycont':
                if action == 'start':
                    path.append(elem.attrib.get('id'))
                elif action == 'end':
                    path.pop()
        if len(tree.getroot()) == 0:
            self.LOG.info("Removing {}".format(filename))
            os.remove(filename)
        elif pruned:
            self.LOG.info("Pruning {} objects from {}".format(pruned, filename))
            with open(filename, 'w') as f:
                f.write(etree.tostring(tree))
        else:
            self.LOG.debug("Leaving {} unchanged".format(filename))
    except Exception, e:
        self.LOG.error("Unable to postprocess {}: {}".format(filename, e))
def remove_unlikely_nodes(self):
    remove_list = []
    context = iterwalk(self.html)
    for action, elem in context:
        s = "%s%s" % (elem.get('class', ''), elem.get('id', ''))
        if (REGEXPS['unlikelyNodes'].search(s)
                and (not REGEXPS['okMaybeItsANode'].search(s))
                and elem.tag != 'body'):
            logger.debug("Removing unlikely node - %s" % s)
            remove_list.append(elem)
    [e.drop_tree() for e in remove_list if e.tag != 'html']
def __init__(self, fd):
    self.fontspecs = {}
    self.tree = etree.parse(fd, etree.HTMLParser(encoding='utf8'))
    for event, spec in etree.iterwalk(self.tree, tag='fontspec'):
        atts = spec.attrib
        fontid = int(atts['id'])
        assert fontid not in self.fontspecs
        self.fontspecs[fontid] = FontSpec(atts, fontid)
def ViewParsePage(page):
    childs = []
    iw = etree.iterwalk(page, events=('start', 'end', 'start-ns', 'end-ns'),
                        tag=('field', 'delimiter'))
    for event, element in iw:
        if event == 'start':
            p = element.getparent()
            if p is not None and p.tag == page.tag:
                childs.append(_TAGPARSERS[element.tag](element))
    return ['page', dict(page.items()), page.text, childs]
def _fast_iter_episode(self, sele, function):
    context = etree.iterwalk(sele, events=('end',), tag='episode')
    for event, elem in context:
        function(elem)
        # Clear memory
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]
    del context
def elements(self):
    schema_tree = etree.parse(self.schema_file)
    eligible_elements = []
    for event, element in etree.iterwalk(schema_tree):
        tag = self.strip_schema_ns(element)
        if (tag and tag == "element"
                and element.get("type") in ("xs:IDREF", "xs:IDREFS")):
            eligible_elements.append(element.get("name"))
    return eligible_elements
def ViewParseViews(view):
    childs = []
    iw = etree.iterwalk(view, events=('start', 'end', 'start-ns', 'end-ns'),
                        tag=_ELTAGS)
    for event, element in iw:
        if event == 'start':
            p = element.getparent()
            if p is not None and p.tag == view.tag:
                childs.append(_TAGPARSERS[element.tag](element))
    return [view.tag, dict(view.items()), view.text, childs]
def remove_unlikely_candidates(self):
    context = iterwalk(self.html)
    for action, elem in context:
        s = "%s%s" % (elem.get('class', ''), elem.get('id', ''))
        self.debug(s)
        if (REGEXES['unlikelyCandidatesRe'].search(s)
                and (not REGEXES['okMaybeItsACandidateRe'].search(s))
                and elem.tag != 'body'):
            self.debug("Removing unlikely candidate - %s" % (s,))
            elem.drop_tree()
def _match_one(tree, nodes, selector):
    elements = []
    prefix = selector[0]
    sc = selector[1:]
    # find by attr
    if prefix == ".":
        parts = sc.split("-")
        if parts:
            cat = parts[0]
            value = "-".join(parts[1:])
        else:
            cat = "class"
            value = sc
        selector = re.compile(value)
        for node in nodes:
            v = node.attrib.get(cat, "")
            if selector.findall(v):
                elements.append(node)
    # find by line number
    elif prefix == ">":
        if "-" in selector:
            smin, smax = [int(x) for x in sc.split("-")]
        else:
            smin = smax = int(sc)
        for _, node in etree.iterwalk(tree, tag="*", events=("start",)):
            line = node.sourceline
            if line >= smin and line <= smax:
                elements.append(node)
    # find by text
    elif prefix == "-":
        for _, node in etree.iterwalk(tree, tag="*", events=("start",)):
            if node.text and re.findall(sc.decode("utf-8"), node.text, re.UNICODE):
                elements.append(node)
    # find by xpath
    elif prefix == ",":
        elements.extend(tree.xpath(sc))
    # find by tag
    else:
        for node in nodes:
            if selector == node.tag:
                elements.append(node)
    return elements
def elements(self):
    schema_tree = etree.parse(self.schema_file)
    eligible_elements = []
    for event, element in etree.iterwalk(schema_tree):
        tag = self.strip_schema_ns(element)
        if tag == "element":
            elem_type = element.get("type", None)
            if elem_type and elem_type == "InternationalizedText":
                if element.get("name") not in eligible_elements:
                    eligible_elements.append(element.get("name"))
    return eligible_elements
def verify(collection, urn, current_wikipages_list, force_update=False):
    """Updates a urn's indexing info and returns the set of its recursive links.

    `collection`: the mongo collection to use as returned by
        ``get_indexing_mongo_database()``.
    `urn`: the urn to update the index for, starting with "urn:".
    `current_wikipages_list`: the sorted list of urls pointing to `urn`.
    `force_update`: set to True to update the index even if `urn` is already
        in the index (defaults to ``False``).
    """
    if not force_update:
        q = collection.find_one({"urn": urn}, {"recursive_links": 1})
        if q:
            try:
                return set(q["recursive_links"])
            except KeyError:
                return set()
    resource_database = get_resource_database()
    try:
        tree = resource_database.get_xml_tree(urn)
    except UnexpectedHeader:
        # it must be a blob
        perform_upsert(collection, urn, {"fqn": None})
        return set()
    links = set()
    for event, element in etree.iterwalk(tree):
        if ('{http://www.w3.org/1999/xlink}href' in element.attrib
                and element.getparent().tag != '{http://ductus.us/ns/2009/ductus}parents'):
            link = element.attrib['{http://www.w3.org/1999/xlink}href']
            if link.startswith('urn:%s:' % hash_name):
                links.add(link)
    recursive_links = set(links)
    for link in links:
        additional_links = verify(collection, link, [])
        recursive_links.update(additional_links)
    resource = resource_database.get_resource_object(urn)
    assert resource.fqn is not None
    obj = {
        "fqn": resource.fqn,
        "links": list(links),
        "recursive_links": sorted(recursive_links),
        "current_wikipages": sorted(current_wikipages_list),
    }
    try:
        obj["parents"] = sorted([parent.href for parent in resource.common.parents])
        obj["tags"] = sorted([tag.value for tag in resource.tags])
    except AttributeError:
        pass
    perform_upsert(collection, urn, obj)
    return recursive_links
def query_element(tree, selector):
    elements = []
    nodes = []
    for _, node in etree.iterwalk(tree, tag="*", events=("start",)):
        nodes.append(node)
    if selector.startswith("["):
        for x in selector.strip("[").strip("]").split(","):
            elements.extend(_match_one(tree, nodes, x.strip()))
    else:
        elements.extend(_match_one(tree, nodes, selector))
    return elements
def enumerate_paths(element, prefixes):
    added = set()

    def add(key):
        added.add(key)

    indices = {}
    star_index = 0
    context = etree.iterwalk(element, events=("start", "end"))
    it = iter(context)
    the_element = it.next()
    for (k, v) in the_element[1].attrib.items():
        add(("parent-attr", k))
    skip = 0
    paths = []
    has_children = False
    for action, elem in it:
        log("%s %s", action, elem)
        if action == "start":
            skip += 1
            if skip == 1:
                has_children = False
                star_index += 1
                paths.append(("*", star_index, None))
                namespace, tag = lxml_tag(elem.tag)
                log("namespace=%r prefixes=%r", namespace, prefixes)
                prefix = prefixes[namespace]
                el = path_element((prefix, tag))
                indices.setdefault(el, 0)
                indices[el] += 1
                paths.append((el, None, None))
                paths.append((el, indices[el], None))
                for (k, v) in elem.attrib.items():
                    paths.append((el, None, (k, v)))
            else:
                has_children = True
        elif action == "end":
            if skip == 1:
                for p in paths:
                    el, position, att_test = p
                    add(("element", el, position, att_test, has_children))
                paths = []
            skip -= 1
    log("indices=%r", indices)
    for x in added:
        log("x=%r", x)
    return added, indices, star_index
def getConfig(self, configstr):
    self.methodname = "getConfig(self," + configstr + ")"
    self.configstr = configstr
    self.configarr = []
    self.configvar = ''
    log = logger("xml_config_parser")
    if self.verbose:
        log.debug("launched " + self.methodname + " of " + self.classname + ".")
    if self.configstr == "excluded_interfaces":
        #for self.configtag in self.abyle_config.getElementsByTagName("interface"):
        configwalk = etree.iterwalk(self.abyle_config, events=("start", "end"),
                                    tag=str("interface"))
        for action, elem in configwalk:
            if action == 'start':
                attributes = elem.attrib
                if str(attributes.get("excluded")).upper() == "YES":
                    if self.verbose:
                        log.debug(elem.text)
                    self.configarr.append(elem.text)
        return self.configarr
    else:
        #for self.configtag in self.abyle_config.getElementsByTagName(self.configstr):
        configwalk = etree.iterwalk(self.abyle_config, events=("start", "end"),
                                    tag=str(self.configstr))
        for action, elem in configwalk:
            if action == 'start':
                if self.verbose:
                    log.debug(elem.text)
                self.configarr.append(elem.text)
        try:
            self.configvar = self.configarr[1]
            return self.configarr
        except (IndexError):
            return self.configarr[0]
def print_policy(self, tree):
    policy_out = {}
    logical_elements = ["and-match", "or-match"]
    comparison_elements = ["attribute-match"]
    criteria_walk = etree.iterwalk(tree, events=("start", "end"))
    criteria_walk.next()
    logic_stack = []
    obj_policy_set = []
    policy_set = []
    top = "and"
    for action, element in criteria_walk:
        if action == "start":
            if element.tag == "object-policy":
                policy_clause = "This experiment may %s a %s if" % (
                    element.get("allow"), tree.get("for"))
                sub_count = 0
            elif element.tag == "attribute-policy":
                policy_clause = "This experiment may %s a %s of a %s if" % (
                    element.get("allow"), element.getparent().get("type"),
                    tree.get("for"))
                sub_count = 0
            elif element.tag in logical_elements:
                if "and" in element.tag:
                    top = "and"
                elif "or" in element.tag:
                    top = "or"
                logic_stack.append(top)
            elif element.tag in comparison_elements:
                policy_clause = "%s its %s matches %s %s" % (
                    policy_clause, element.get("match"),
                    element.get("on_object"), top)
                sub_count += 1
        elif action == "end":
            if element.tag == "object-criteria":
                policy_clause = policy_clause.rsplit(" ", 1)[0]
                obj_policy_set.append(policy_clause)
            elif element.tag == "attribute-criteria":
                policy_clause = policy_clause.rsplit(" ", 1)[0]
                obj_policy_set.append(policy_clause)
            elif element.tag == "object-policy" and sub_count == 0:
                policy_clause = policy_clause.rsplit(" ", 1)[0]
                obj_policy_set.append(policy_clause)
            elif element.tag == "attribute-policy" and sub_count == 0:
                policy_clause = policy_clause.rsplit(" ", 1)[0]
                obj_policy_set.append(policy_clause)
            elif element.tag in logical_elements:
                logic_stack.pop()
    return obj_policy_set
def __init__(self, election_tree, schema_file):
    super(ValidIDREF, self).__init__(election_tree, schema_file)
    for event, element in etree.iterwalk(self.election_tree, events=("end",)):
        if "objectId" not in element.attrib:
            continue
        obj_id = element.get("objectId")
        if not obj_id:
            continue
        self.all_object_ids.add(obj_id)