def validate_xml(fname):
    """Validate an Elbe XML file against the dbsfed XSD schema.

    :param fname: path of the XML file to validate
    :return: list of human-readable error strings; empty list when the
        file is valid
    """
    # NOTE(review): the schema is fetched over the network on every call.
    schema_file = "https://www.linutronix.de/projects/Elbe/dbsfed.xsd"
    # huge_tree lifts lxml's default limits on tree depth/size.
    parser = XMLParser(huge_tree=True)
    schema_tree = etree.parse(schema_file)
    schema = etree.XMLSchema(schema_tree)

    try:
        xml = parse(fname, parser=parser)

        if schema.validate(xml):
            return []
    except etree.XMLSyntaxError as e:
        return ["XML Parse error\n" + str(e)]
    except Exception as e:
        # FIX: was a bare 'except:', which also swallowed KeyboardInterrupt
        # and SystemExit and reported them as validation results.
        return ["Unknown Exception during validation\n" + str(e)]

    # We have errors, return them in string form...
    return ["%s:%d error %s" % (err.filename, err.line, err.message)
            for err in schema.error_log]
def write_xml(self, fname, template_fname, offset=None, ref_angles=None):
    """Write this model into a MuJoCo XML file based on a template.

    :param fname: output file path
    :param template_fname: template XML file providing 'worldbody' and
        'actuator' sections
    :param offset: (3,) translation applied to the root body; defaults to
        the zero vector
    :param ref_angles: dict of reference angles passed through to
        write_xml_bodynode; defaults to {}
    """
    # FIX: 'offset=np.array([0, 0, 0])' was a mutable default argument —
    # the same array object was shared between calls and could be mutated
    # downstream. Backward-compatible: the effective default is unchanged.
    if offset is None:
        offset = np.array([0, 0, 0])
    if ref_angles is None:
        ref_angles = {}
    parser = XMLParser(remove_blank_text=True)
    tree = parse(template_fname, parser=parser)
    worldbody = tree.getroot().find('worldbody')
    self.write_xml_bodynode(self.root, worldbody, offset, ref_angles)

    # create actuators: one unit-gear motor per joint, skipping the first
    # joint (presumably the root/free joint — TODO confirm).
    actuators = tree.getroot().find('actuator')
    joints = worldbody.findall('.//joint')
    for joint in joints[1:]:
        name = joint.attrib['name']
        attr = {'name': name, 'joint': name, 'gear': '1'}
        SubElement(actuators, 'motor', attr)
    tree.write(fname, pretty_print=True)
def get(self, filename_wo_suffix, recipe_type):
    """ Find other versions.

    Parses the recipe XML for *filename_wo_suffix*, extracts its version,
    then globs sibling '<name>-*.xml' files in the same catalog directory
    and collects those whose version differs.

    :param filename_wo_suffix: recipe file name without the '.xml' suffix
        (expected to end in '-<version>' — assumption from the slicing
        below; confirm against callers)
    :param recipe_type: subdirectory of the catalog to search in
    :return: list of dicts with version as key and filepath as value
    """
    original_file_path_wo_suffix = safe_join(
        current_app.config['catalog-directory'], recipe_type,
        filename_wo_suffix)
    parser = XMLParser(remove_blank_text=True)
    try:
        actual_doc = parse(original_file_path_wo_suffix + '.xml',
                           parser=parser)
    except IOError as e:
        # Missing/unreadable recipe: log and report no other versions.
        current_app.logger.error(e)
        return []
    versions = self.xpath_get_version(actual_doc)
    actual_version = ''
    actual_version_len = 0
    if versions:
        actual_version = versions[0]
        # +1 accounts for the '-' separator before the version suffix.
        actual_version_len = len(actual_version) + 1
    # Strip '-<version>' from the file name to build the glob pattern.
    filename_wo_version = filename_wo_suffix[:-actual_version_len]
    return_list = []
    if actual_version:
        for file_path in glob(
                safe_join(current_app.config['catalog-directory'],
                          recipe_type, filename_wo_version) + '-*.xml'):
            try:
                doc = parse(file_path, parser=parser)
                if self.xpath_has_other_versions(
                        doc, version=actual_version,
                        filename_wo_version=filename_wo_version):
                    # Keep only the last two path components for the value.
                    path_list = file_path.split('/')
                    return_dict = {
                        self.xpath_get_version(doc)[0]:
                        '/' + path_list[-2] + '/' + path_list[-1]}
                    return_list.append(return_dict)
            except XMLSyntaxError as e:
                # Broken catalog entries are skipped, not fatal.
                current_app.logger.warning(
                    'Syntax error in catalog file "%s": \n%s', file_path, e)
    return return_list
def to_xml(text, encoding):
    """Parse *text* into an ElementTree element.

    Tries the stdlib parser first; on failure, falls back to lxml's
    recovering parser and re-serializes the result for the stdlib API.

    :param text: XML document as str (a leading BOM is stripped)
    :param encoding: encoding to encode/parse with; None means utf-8
    :raises ParseError: if even the recovered document cannot be parsed;
        the message includes an excerpt around the offending position
    """
    from xml.etree.ElementTree import fromstring, ParseError
    processed = text.lstrip(BOM).encode(encoding or 'utf-8')
    try:
        return fromstring(processed)
    except ParseError:
        from io import BytesIO
        from lxml.etree import XMLParser, parse, tostring
        # Exchange servers may spit out the weirdest XML. lxml is pretty good at recovering from errors
        log.warning('Fallback to lxml processing of faulty XML')
        magical_parser = XMLParser(encoding=encoding or 'utf-8', recover=True)
        # FIX: 'processed' is bytes, so it must be wrapped in BytesIO.
        # StringIO(processed) raised TypeError before the recovery parse
        # could even run, defeating the fallback entirely.
        root = parse(BytesIO(processed), magical_parser)
        try:
            return fromstring(tostring(root))
        except ParseError as e:
            line_no, col_no = e.lineno, e.offset
            try:
                offending_line = processed.splitlines()[line_no - 1]
            except IndexError:
                offending_line = b''
            offending_excerpt = offending_line[max(0, col_no - 20):col_no + 20].decode('ascii', 'ignore')
            raise ParseError('%s\nOffending text: [...]%s[...]' % (str(e), offending_excerpt)) from e
def getAnimeInformation(animeIds: List[int], maxBatchSize: int = 50, waitTime: float = 1):
    """Fetch ANN encyclopedia XML records for the given anime ids.

    Ids are requested in batches; a batch whose XML fails to parse is
    retried one id at a time so a single bad record doesn't lose the batch.

    :param animeIds: numeric title ids to fetch
    :param maxBatchSize: ids per batched request
    :param waitTime: seconds to sleep between requests (rate limiting)
    :return: (elements, problems) — parsed child elements and the raw XML
        strings that failed to parse even individually
    """
    utf8Parser: XMLParser = XMLParser(encoding="utf-8")
    # FIX: round up so a trailing partial batch is counted in the total
    # shown by the progress print (floor division undercounted by one).
    totalBatches = -(-len(animeIds) // maxBatchSize)
    baseGetByTitleIdURL = "https://cdn.animenewsnetwork.com/encyclopedia/api.xml?title="
    elements = []
    problems = []
    for index, animeIdsBatch in enumerate(grouper(animeIds, maxBatchSize)):
        print(index + 1, "of", totalBatches)
        # Remove nones (grouper pads the final batch with None)
        animeIdsBatch = [animeRecordId for animeRecordId in animeIdsBatch if animeRecordId]
        # FIX: the ids are ints; str.join requires strings, so the old
        # "/".join(animeIdsBatch) raised TypeError on the first batch.
        batchRequestURL = baseGetByTitleIdURL + "/".join(
            str(animeRecordId) for animeRecordId in animeIdsBatch)
        request = requests.get(batchRequestURL)
        try:
            requestXMLString = request.content.decode(encoding="utf-8")
            requestXMLRoot = etree.fromstring(requestXMLString, utf8Parser)
            for element in requestXMLRoot:
                elements.append(element)
        except XMLSyntaxError:
            # Batch failed to parse: retry each id on its own.
            print("Error at", index + 1)
            for batchIndex, animeId in enumerate(animeIdsBatch):
                print(batchIndex, "of", maxBatchSize, "in batch", index)
                batchRequestURL = baseGetByTitleIdURL + str(animeId)
                request = requests.get(batchRequestURL)
                requestXMLString = request.content.decode(encoding="utf-8")
                try:
                    requestXMLRoot = etree.fromstring(requestXMLString, utf8Parser)
                    for element in requestXMLRoot:
                        elements.append(element)
                except XMLSyntaxError:
                    print("problem at", batchIndex, "of", maxBatchSize, "in batch", index)
                    problems.append(requestXMLString)
                time.sleep(waitTime)
        time.sleep(waitTime)
    return elements, problems
import numpy as np import arrayConfigurationTools as act import os from optparse import OptionParser #aaguirre from wsgiref.simple_server import make_server from pyramid.config import Configurator from pyramid.response import Response from pyramid.response import FileResponse from asdf.utils import is_valid_uid from pyramid.httpexceptions import HTTPBadRequest p = XMLParser(huge_tree=True) ALMA_LONGITUDE = -67.754748 # ICT-4143, -67.754694=JPL-Horizons, -67.754929=CASA observatories ALMA_LATITUDE = -23.029211 # ICT-4143, -23.029167=JPL-Horizons, -23.022886=CASA observatories BB_NAME = ['BB_1', 'BB_2', 'BB_3', 'BB_4'] def readOptions(): "Read the CL options ..." parser = OptionParser() parser.add_option("-u", "--uid", dest="uid", default='none',
def create_lxml_context():
    """Return an offline lxml parser using the project's element classes.

    no_network prevents lxml from fetching external resources during
    parsing; the lookup maps plain elements and comments to the custom
    Element/Comment classes.
    """
    class_lookup = ElementDefaultClassLookup(element=Element, comment=Comment)
    xml_parser = XMLParser(no_network=True)
    xml_parser.set_element_class_lookup(class_lookup)
    return xml_parser
from functools import partial from openpyxl import DEFUSEDXML, LXML if LXML is True: from lxml.etree import ( Element, SubElement, register_namespace, QName, xmlfile, XMLParser, ) from lxml.etree import fromstring, tostring # do not resolve entities safe_parser = XMLParser(resolve_entities=False) fromstring = partial(fromstring, parser=safe_parser) else: from xml.etree.ElementTree import (Element, SubElement, fromstring, tostring, QName, register_namespace) from et_xmlfile import xmlfile if DEFUSEDXML is True: from defusedxml.ElementTree import fromstring from xml.etree.ElementTree import iterparse if DEFUSEDXML is True: from defusedxml.ElementTree import iterparse from openpyxl.xml.constants import (CHART_NS, DRAWING_NS, SHEET_DRAWING_NS, CHART_DRAWING_NS, SHEET_MAIN_NS, REL_NS,
def annotate(corePath):
    """Generate an annotated Space Haven library.

    Reads texture annotations plus the game's 'animations', 'haven' and
    'texts' library files under *corePath*, copies human-readable names
    into '_annotation' attributes, and writes the annotated copies back
    as 'animations_annotated.xml' and 'haven_annotated.xml'.
    """
    # Map texture region id -> human annotation from the local file.
    texture_names = {}
    local_texture_names = ElementTree.parse(
        "textures_annotations.xml", parser=XMLParser(recover=True))
    for region in local_texture_names.findall(".//re[@n]"):
        if not region.get("_annotation"):
            continue
        texture_names[region.get('n')] = region.get("_annotation")

    # Annotate animation asset positions with the texture names.
    animations = ElementTree.parse(
        os.path.join(corePath, "library", "animations"),
        parser=XMLParser(recover=True))
    for assetPos in animations.findall('.//assetPos[@a]'):
        asset_id = assetPos.get('a')
        if not asset_id in texture_names:
            continue
        assetPos.set('_annotation', texture_names[asset_id])
    annotatedPath = os.path.join(corePath, "library", "animations_annotated.xml")
    animations.write(annotatedPath)
    ui.log.log(" Wrote annotated annimations to {}".format(annotatedPath))

    # recover=True: the game files are not strictly well-formed XML.
    haven = ElementTree.parse(
        os.path.join(corePath, "library", "haven"),
        parser=XMLParser(recover=True))
    texts = ElementTree.parse(
        os.path.join(corePath, "library", "texts"),
        parser=XMLParser(recover=True))
    tids = {}

    # Load texts: tid -> English string.
    for text in texts.getroot():
        tids[text.get("id")] = text.find("EN").text

    def nameOf(element):
        # English display name via the element's <name tid=...>, or "".
        name = element.find("name")
        if name is None:
            return ""
        tid = name.get("tid")
        if tid is None:
            return ""
        return tids[tid]

    ElementRoot = haven.find("Element")

    # Annotate Elements
    for element in ElementRoot:
        mid = element.get("mid")
        objectInfo = element.find("objectInfo")
        if objectInfo is not None:
            element.set("_annotation", nameOf(objectInfo))

    # Annotate basic products
    # first pass also builds the names cache
    elementNames = {}
    ProductRoot = haven.find("Product")
    for element in ProductRoot:
        name = nameOf(element) or element.get("elementType") or ""
        if name:
            element.set("_annotation", name)
        elementNames[element.get("eid")] = name

    for item in haven.find("Item"):
        name = nameOf(item) or item.get("elementType") or ""
        if name:
            item.set("_annotation", name)
        elementNames[item.get("mid")] = name

    # small helped to annotate a node
    def _annotate_elt(element, attr=None):
        # Look the element up in the names cache, by *attr* if given,
        # otherwise by its 'element'/'elementId' attribute.
        if attr:
            name = elementNames[element.get(attr)]
        else:
            name = elementNames[element.get("element", element.get("elementId"))]
        if name:
            element.set("_annotation", name)
        return name

    # construction blocks for the build menu
    for me in ElementRoot:
        for customPrice in me.findall(".//customPrice"):
            for sub_l in customPrice:
                _annotate_elt(sub_l)

    # Annotate facility processes, now that we know the names of all the products involved
    for element in ProductRoot:
        processName = []
        for need in element.xpath("needs/l"):
            name = _annotate_elt(need)
            processName.append(name)
        processName.append("to")
        for product in element.xpath("products/l"):
            name = _annotate_elt(product)
            processName.append(name)
        # > 2 means at least one need or product beyond the "to" separator.
        if len(processName) > 2 and not element.get("_annotation"):
            processName = " ".join(processName)
            elementNames[element.get("eid")] = processName
            element.set("_annotation", processName)

    #generic rule should work for all remaining nodes ?
    for sub_element in haven.findall(".//*[@consumeEvery]"):
        try:
            _annotate_elt(sub_element)
        except:
            pass
            # error on 446, weird stuff
            #print(sub_element.tag)
            #print(sub_element.attrib)

    # iterate again once we have built all the process names
    for process in ProductRoot.xpath('.//list/processes/l[@process]'):
        process.set("_annotation", elementNames[process.get("process")])

    for trade in haven.find('TradingValues').findall('.//t'):
        try:
            _annotate_elt(trade, attr='eid')
        except:
            pass

    DifficultySettings = haven.find('DifficultySettings')
    for settings in DifficultySettings:
        name = nameOf(settings)
        if name:
            settings.set("_annotation", name)

    for res in DifficultySettings.xpath('.//l'):
        try:
            _annotate_elt(res, attr='elementId')
        except:
            pass

    for res in DifficultySettings.xpath('.//rules/r'):
        try:
            _annotate_elt(res, attr='cat')
        except:
            pass

    annotatedHavenPath = os.path.join(corePath, "library", "haven_annotated.xml")
    haven.write(annotatedHavenPath)
    ui.log.log(" Wrote annotated spacehaven library to {}".format(
        annotatedHavenPath))
def retrieve_doc(self, doc_id):
    """Retrieve the raw document element for *doc_id* from the corpus file.

    The parsed tree for the current file is cached in
    self.xml_parser_cache; which parsing strategy is used depends on the
    corpus flags (acquaint / acquaint2 / gigaword) set by configure().

    :param doc_id: corpus document identifier
    :return: the matching lxml element, or None if no corpus flag matched
    """
    self.configure(doc_id)
    raw_doc = None
    tree = self.xml_parser_cache.get(self.doc_path)
    if self.acquaint:
        if tree is None:
            # ACQUAINT files are SGML-ish; parse leniently as an HTML
            # fragment wrapped in a synthetic <body>.
            parser = etree.HTMLParser(encoding='utf-8',
                                      remove_blank_text=True)
            with open(self.doc_path) as file:
                tree = html.fragment_fromstring(file.read(),
                                                create_parent='body',
                                                parser=parser)
        # docno text is padded with single spaces in these files.
        raw_doc = [
            element for element in tree.findall("doc")
            if element.find("docno").text == " " + doc_id + " "
        ][0]
        raw_doc.getparent().remove(
            raw_doc
        )  # Removes previous accessed raw document from tree to save memory in cache
    elif self.acquaint2:
        if tree is None:
            tree = etree.parse(self.doc_path)
        raw_doc = [
            element for element in tree.findall("DOC")
            if element.get("id") == doc_id
        ][0]
        #### Must not remove document from tree because documents repeat under different topics
    elif self.gigaword:
        if tree is None:
            p = XMLParser(
                huge_tree=True
            )  #### Some files are too large, without this they prevent parsing
            with gzip.open(self.doc_path, 'rt', encoding='latin-1') as file:
                data = file.read()
            #### In this one file there's a less than symbold that prevents parsing
            if self.doc_path == '/corpora/LDC/LDC11T07/data/xin_eng/xin_eng_200811.gz':
                data = data.replace('<3', 'lt 3')  ### Replaces the < with lt
            # Gigaword files are concatenated <DOC>s; wrap in a synthetic
            # root so the stream parses as a single document.
            tree = etree.fromstring(
                '<DOCSTREAM>\n' + data.strip() + '\n</DOCSTREAM>\n',
                parser=p)
        raw_doc = [
            element for element in tree.findall("DOC")
            if element.get("id") == doc_id
        ][0]
        raw_doc.getparent().remove(
            raw_doc
        )  # Removes previous accessed raw document from tree to save memory in cache
    self.xml_parser_cache.update({self.doc_path: tree})
    return raw_doc
def setup(self, xml):
    """ sets the parameters up from xml

    Parses the datasource XML, validates that a record name and a device
    are present, builds the Tango device address, connects a proxy, and
    (for __CLIENT__-grouped sources) derives client address strings.

    :param xml: datasource parameters
    :type xml: :obj:`str`
    :raises DataSourceSetupError: when the record name or device is
        missing, or the device proxy cannot be created
    """
    if sys.version_info > (3, ):
        xml = bytes(xml, "UTF-8")
    # collect_ids=False skips lxml's ID hash building (not needed here).
    root = et.fromstring(xml, parser=XMLParser(collect_ids=False))
    rec = root.find("record")
    name = None
    if rec is not None:
        name = rec.get("name")
    if not name:
        if self._streams:
            self._streams.error("TangoSource::setup() - "
                                "Tango record name not defined: %s" % xml,
                                std=False)
        raise DataSourceSetupError("Tango record name not defined: %s" % xml)
    dv = root.find("device")
    device = None
    client = False
    if dv is not None:
        device = dv.get("name")
        hostname = dv.get("hostname")
        port = dv.get("port")
        group = dv.get("group")
        encoding = dv.get("encoding")
        memberType = dv.get("member")
        # default to 'attribute' for missing/unknown member types
        if not memberType or memberType not in [
                "attribute", "command", "property"]:
            memberType = "attribute"
        # '__CLIENT__' is a sentinel group marking client-mode sources
        if group != '__CLIENT__':
            self.group = group
        else:
            client = True
        self.member = TgMember(name, memberType, encoding,
                               streams=self._streams)
    if not device:
        if self._streams:
            self._streams.error("TangoSource::setup() - "
                                "Tango device name not defined: %s" % xml,
                                std=False)
        raise DataSourceSetupError("Tango device name not defined: %s" % xml)
    # Python 2 needs byte strings for the device address parts.
    if sys.version_info > (3, ):
        ehostname = hostname
        eport = port
        edevice = device
    else:
        ehostname = hostname.encode() if hostname else hostname
        eport = port.encode() if port else port
        edevice = device.encode() if device else device
    # Full 'host:port/device' address when all parts are known,
    # bare device name otherwise.
    if hostname and port and device:
        self.device = "%s:%s/%s" % (ehostname, eport, edevice)
    elif device:
        self.device = "%s" % (edevice)
    self.__proxy = ProxyTools.proxySetup(self.device, streams=self._streams)
    if not self.__proxy:
        if self._streams:
            self._streams.error("TangoSource::setup() - "
                                "Cannot connect to: %s \ndefined by %s"
                                % (self.device, xml),
                                std=False)
        raise DataSourceSetupError(
            "Cannot connect to: %s \ndefined by %s" % (self.device, xml))
    if hostname and port and device and client:
        # Prefer the database host reported by the proxy; fall back to
        # the hostname from the XML if the proxy call fails.
        try:
            host = self.__proxy.get_db_host().split(".")[0]
        except Exception:
            host = ehostname.split(".")[0]
        self.client = "%s:%s/%s/%s" % (host, eport, edevice, name.lower())
        self.fullclient = "%s:%s/%s/%s" % (socket.getfqdn(host),
                                           eport, edevice, name.lower())
def _load(self):
    """Parse the page file and cache its root plus the first two children."""
    blank_stripping = XMLParser(remove_blank_text=True)
    document = parse(self.name_page, blank_stripping)
    root = document.getroot()
    self._root = root
    # Child 0 appears to be the text section, child 1 the keys section
    # (inferred from the attribute names — confirm against callers).
    self._text = root[0]
    self._keys = root[1]
node_tracker[node] = [node_tracker[parent][0] + 1, parent] node_tracker = sorted([(depth, parent, child) for child, (depth, parent) in node_tracker.items()], key=lambda x: x[0], reverse=True) for _, parent, child in node_tracker: if parent is None: break parent.remove(child) del tree large_parser = XMLParser(huge_tree=True) parser = etree.XMLParser(remove_blank_text=True) if __name__ == "__main__": output = dict() with open("iati_schema_xpaths.txt") as xpath_txt_file: xpaths = xpath_txt_file.read().splitlines() xml_file = "input.xml" try: tree = etree.parse(xml_file, parser=large_parser) except etree.XMLSyntaxError: pass root = tree.getroot()
# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from os import listdir, makedirs from os.path import join from lxml.etree import (PythonElementClassLookup, XMLParser, tostring, fromstring, CommentBase, Comment, Element, SubElement, parse, ParseError, ElementTree, CustomElementClassLookup) from .exceptions import MissingFileError, ParserError, TagNotFound module_parser = XMLParser(remove_pis=True, remove_blank_text=True) class _CommentLookup(CustomElementClassLookup): def lookup(self, elem_type, doc, namespace, name): from .nodes import NodeComment if elem_type == "comment": return NodeComment else: return None class _NodeClassLookup(PythonElementClassLookup): """ Class that handles the custom lookup for the element factories.
import logging
import time
from lxml import objectify
from lxml.etree import XMLParser
from suds import WebFault, TypeNotFound
from suds.client import Client as sudsClient
from suds.plugin import MessagePlugin
from suds.xsd.query import TypeQuery, ElementQuery

# Install an objectify-based lxml parser as the module-wide default:
# huge_tree lifts lxml's size limits, remove_blank_text drops
# formatting whitespace.
parser = XMLParser(remove_blank_text=True, huge_tree=True)
parser.set_element_class_lookup(objectify.ObjectifyElementClassLookup())
objectify.set_default_parser(parser)

logger = logging.getLogger('suds.client.lxml')
logging.getLogger('suds.client').setLevel(
    logging.CRITICAL)  # Don't show suds messages!


class SoapObject:
    """Lightweight named attribute bag used to represent SOAP payloads.

    Attributes are set dynamically by callers; len() reports how many
    such attributes exist (excluding the name itself).
    """

    def __init__(self, name):
        # Stored in __name__ so it lives alongside dynamic attributes
        # in __dict__ but can be excluded from the length count.
        self.__name__ = name

    def __len__(self):
        return len(self.__dict__.items()) - 1  # ignore the __name__ property

    def __repr__(self):
        return self.__str__()

    def __str__(self):
        return self.__name__
def load_xml_from_text(text):
    """Parse *text* into an lxml element, keeping CDATA sections intact."""
    cdata_preserving_parser = XMLParser(strip_cdata=False)
    return fromstring(text, parser=cdata_preserving_parser)
type=int, help= "interval (in minutes) under which two messages are considered to be from the same discussion" ) parser.add_argument( "--tokenize", dest="tokenize", default=False, action="store_true", help= "tokenize the output string. Activate only if it isn't already done by your Word2Vec" ) args = parser.parse_args() lxml_parser = XMLParser(huge_tree=True, recover=True) payload = open(args.input_file, "rb").read().decode("utf-8") payload = payload.encode('utf-16', 'surrogatepass').decode('utf-16') payload = payload.encode('utf-8') tree = parse(BytesIO(payload), parser=lxml_parser) root = tree.getroot() messages = {} for msg in root.getchildren(): msgdata = dict(zip(msg.keys(), msg.values())) num = msgdata['address'] msgdata = {
def setup(self, xml):
    """ sets the parameters up from xml

    Parses the datasource XML, registers every nested <datasource> as a
    named input, and extracts the Python script from the <result> node.

    :param xml: datasource parameters
    :type xml: :obj:`str`
    :raises DataSourceSetupError: when an input or the script is missing
        or misnamed
    """
    if sys.version_info > (3,):
        xml = bytes(xml, "UTF-8")
    # collect_ids=False skips lxml's ID hash building (not needed here).
    root = et.fromstring(xml, parser=XMLParser(collect_ids=False))
    mds = root.find("datasource")
    inputs = []
    if mds is not None:
        inputs = root.findall(".//datasource")
    for inp in inputs:
        if "name" in inp.attrib and "type" in inp.attrib:
            name = inp.get("name")
            dstype = inp.get("type")
            if len(name) > 0:
                # NOTE(review): name[:2] is two characters and can never
                # equal 'ds.' (three characters) — this prefix stripping
                # likely never triggers; was name[:3] intended? Confirm.
                if len(name) > 3 and name[:2] == 'ds.':
                    name = name[3:]
                self.__sources[name] = (dstype, self._toxml(inp))
            else:
                if self._streams:
                    self._streams.error(
                        "PyEvalSource::setup() - "
                        "PyEval input %s not defined" % name,
                        std=False)
                raise DataSourceSetupError(
                    "PyEvalSource::setup() - "
                    "PyEval input %s not defined" % name)
        else:
            if self._streams:
                self._streams.error(
                    "PyEvalSource::setup() - "
                    "PyEval input name wrongly defined",
                    std=False)
            raise DataSourceSetupError(
                "PyEvalSource::setup() - "
                "PyEval input name wrongly defined")
    res = root.find("result")
    if res is not None:
        self.__name = res.get("name") or 'result'
        # NOTE(review): same never-true 'ds.' comparison as above.
        if len(self.__name) > 3 and self.__name[:2] == 'ds.':
            self.__name = self.__name[3:]
        self.__script = self._getText(res)
    if len(self.__script) == 0:
        if self._streams:
            self._streams.error(
                "PyEvalSource::setup() - "
                "PyEval script %s not defined" % self.__name,
                std=False)
        raise DataSourceSetupError(
            "PyEvalSource::setup() - "
            "PyEval script %s not defined" % self.__name)
    # Remember whether the script references the shared 'commonblock'.
    if "commonblock" in self.__script:
        self.__commonblock = True
    else:
        self.__commonblock = False
PATH="Y:\\OutputReports\\mitm_attack" if __name__ == '__main__': dirs = os.listdir(PATH) count = 0 init(autoreset=True) # Print heading line print("experiment_id;Outcome;AttackMethod;TrapUrl;RequestHeaders;RequestContents;Elevated") with open("mitm_res.csv", "wt") as f: f.write("Experiment Id;Outcome;AttackMethod;TrapUrl;RequestHeaders;RequestContents;Elevated\n") for d in dirs: mitm_file = os.path.join(PATH,d,"mitm.xml") parser = XMLParser(ns_clean=True, recover = True) xml = ET.parse(mitm_file,parser) # Retrieve info about the MITM Attack mitm = xml.findall(".//MitmAttack")[0] success = mitm.find("Success").text pid = mitm.find("ProcessId").text elevated = mitm.find("ProcessElevated").text proc_image_path = mitm.find("ProcessPath").text attack_url = None req_headers=None req_contents=None attack_type = None request = mitm.find("NetworkInfo/Flow/Request") # Sometimes we get missing network info. For now just skip that info
XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace' PKL_NAMESPACE = "http://namespaces.zope.org/pickle" OBJ_NAMESPACE = "http://namespaces.zope.org/pyobj" PKL_PREFIX = '{%s}' % PKL_NAMESPACE OBJ_PREFIX = '{%s}' % OBJ_NAMESPACE XML_PREFIX = '{%s}' % XML_NAMESPACE PMAP = {'p': PKL_NAMESPACE} NAMESPACES = {None: PKL_NAMESPACE, 'xml': XML_NAMESPACE} OMAP = {'o': OBJ_NAMESPACE} pprefixlen = len(PKL_PREFIX) base_element_name = 'Pickle' parser = XMLParser(ns_clean=True) class IItemPickler(Interface): def dumps(): """return an XML representation of item""" class IItemUnpickler(Interface): def loads(str): """return an object represented by str""" def string_convert(s): """ if not valid in xml text, convert to base64
def loadJson(self, data, validate=True, postValidate=True, **parserOptions):
    """Load a PMML model represented as a JSON string, fileName, dict, or file-like object.

    There is no standard XML-to-JSON specification, so we define our own.  Our specification is very similar to U{this proposal<http://www.xml.com/pub/a/2006/05/31/converting-between-xml-and-json.html>}, which collects subelements of different tagnames into different JSON lists, rather than having one long list and needing to specify the tag of each element in that list.

    This has the following advantages, particularly useful for PMML:
      - Frequent tagnames (like <Segment>) are not repeated, wasting space.
      - Subelements with a given tagname can be quickly queried, without having to iterate over a list that contains non-matching tagnames.

    It has the following disadvantages:
      - The relative order of subelements with different tagnames is not preserved.

    We therefore additionally include a JSON attribute named "#" to specify the ordering of subelements in the XML representation.  Also, the specification referenced above represents single-child subelements as JSON objects and multiple children as JSON lists, but for consistency and ease of parsing, we always use lists.  The last difference is that we include "#tail" as well as "#text", so that text outside of an element is preserved (rarely relevant for PMML, but included for completeness).

    Note that this method returns a JSON-like dictionary, not a string.  To serialize to JSON, use the C{json} module from the Python Standard Library, a faster variant, or an exotic serializer such as BSON.

    @type data: string, dict, or file-like object
    @param data: The data to load.
    @type validate: bool
    @param validate: If True, validate the resulting PmmlBinding against this ModelLoader's XSD schema after loading.
    @type postValidate: bool
    @param postValidate: If True, run post-XSD validation checks.  (Note: very few PmmlBinding subclasses have postValidation tests defined as of May 2013.)
    @param **parserOptions: Arguments passed to lxml's U{XMLParser<http://lxml.de/api/lxml.etree.XMLParser-class.html>}.
    @rtype: PmmlBinding
    @return: In-memory PMML object.
    @raise ValueError: If the JSON text is malformed or does not represent PMML, an error is raised.
    """
    # Accept a file-like object, an existing file path, or a JSON string.
    # NOTE: 'basestring' and 'data.keys()[0]' below are Python-2-only.
    if hasattr(data, "read"):
        data = json.load(data)
    elif isinstance(data, basestring):
        if os.path.exists(data):
            data = json.load(open(data))
        else:
            data = json.loads(data)

    if not isinstance(data, dict):
        raise ValueError("JSON object must be a mapping at the top level")

    if validate:
        # Compile the XSD once and reuse it for subsequent loads.
        if self.preparedSchema is None:
            self.preparedSchema = XMLSchema(self.schema)
        schema = self.preparedSchema
    else:
        schema = None

    # Build a parser whose element classes come from the PMML schema:
    # every declared element maps to PmmlBinding unless tagToClass
    # overrides it with a more specific class.
    parser = XMLParser(**parserOptions)
    lookup = ElementNamespaceClassLookup()
    namespace = lookup.get_namespace(defs.PMML_NAMESPACE)
    for xsdElement in self.schema.xpath(
            "xs:element", namespaces={"xs": defs.XSD_NAMESPACE}):
        namespace[xsdElement.attrib["name"]] = PmmlBinding
    namespace.update(self.tagToClass)
    parser.set_element_class_lookup(lookup)

    try:
        nsmap = data["#nsmap"]
    except KeyError:
        raise ValueError(
            "JSON object must have a \"#nsmap\" key at the top level")
    # lxml represents the default namespace with a None key, not "".
    if "" in nsmap:
        nsmap[None] = nsmap[""]
        del nsmap[""]
    del data["#nsmap"]

    if len(data) != 1:
        raise ValueError(
            "JSON object must have exactly one PMML object at the top level"
        )
    tag = data.keys()[0]
    data = data[tag]

    if not isinstance(data, list) or len(data) != 1:
        raise ValueError(
            "Top-level PMML object must be a list with exactly one item")
    data = data[0]

    pmmlBinding = self._loadJsonItem(tag, data, parser, nsmap)

    if validate:
        schema.assertValid(pmmlBinding)

    if postValidate:
        # Walk every PMML-namespace element bottom-up and run its checks.
        for event, elem in iterwalk(pmmlBinding, events=("end", ),
                                    tag="{%s}*" % defs.PMML_NAMESPACE):
            if isinstance(elem, PmmlBinding):
                elem.postValidate()

    return pmmlBinding
def _make_tree(self, fstring):
    """Parse raw markup into an element tree, recovering from bad input."""
    lenient_parser = XMLParser(recover=True,
                               encoding=get_encoding(fstring))
    return etree.fromstring(fstring, parser=lenient_parser)
import six from lxml import etree from lxml.etree import Element, ElementTree, XMLParser from xblock.core import XML_NAMESPACES from xblock.fields import Dict, Scope, ScopeIds from xblock.runtime import KvsFieldData from xmodule.modulestore import EdxJSONEncoder from xmodule.modulestore.inheritance import InheritanceKeyValueStore, own_metadata from xmodule.x_module import XModuleDescriptor # lint-amnesty, pylint: disable=unused-import log = logging.getLogger(__name__) # assume all XML files are persisted as utf-8. EDX_XML_PARSER = XMLParser(dtd_validation=False, load_dtd=False, remove_comments=True, remove_blank_text=True, encoding='utf-8') def name_to_pathname(name): """ Convert a location name for use in a path: replace ':' with '/'. This allows users of the xml format to organize content into directories """ return name.replace(':', '/') def is_pointer_tag(xml_obj): """ Check if xml_obj is a pointer tag: <blah url_name="something" />.
def recovery_parser(xml):
    """Parse raw XML bytes leniently, recovering from syntax errors."""
    lenient = XMLParser(recover=True)
    source = BytesIO(xml)
    return parse(source, lenient)
def xmlpreprocess(fname, output, variants=None, proxy=None):
    """Preprocess an Elbe XML file and write the validated result.

    Resolves XIncludes, applies variant filtering/merging, expands
    <archivedir> elements, applies mirror/proxy/PGP/ISO/port rewrites,
    validates against the dbsfed schema, and writes the result to
    *output* (gzip-compressed).

    :param fname: input XML file path
    :param output: output file path
    :param variants: iterable of active variant names (or None)
    :param proxy: proxy URL to inject (or None)
    :raises XMLPreprocessError: on parse, archivedir, validation or any
        other failure
    """
    # pylint: disable=too-many-locals
    # pylint: disable=too-many-branches

    # first convert variants to a set
    if not variants:
        variants = set([])
    else:
        variants = set(variants)

    # NOTE(review): the schema is fetched over the network on every call.
    schema_file = "https://www.linutronix.de/projects/Elbe/dbsfed.xsd"
    parser = XMLParser(huge_tree=True)
    schema_tree = etree.parse(schema_file)
    schema = etree.XMLSchema(schema_tree)

    try:
        xml = parse(fname, parser=parser)
        xml.xinclude()

        # Variant management
        # check all nodes for variant field, and act accordingly.
        # The result will not contain any variant attributes anymore.
        rmlist = []
        for tag in xml.iter('*'):
            if 'variant' in tag.attrib:
                tag_variants = set(tag.attrib['variant'].split(','))

                # check if tag_variants intersects with
                # active variants.
                intersect = variants.intersection(tag_variants)

                if intersect:
                    # variant is wanted, keep it and remove the variant
                    # attribute
                    tag.attrib.pop('variant')
                else:
                    # tag has a variant attribute but the variant was not
                    # specified: remove the tag delayed
                    rmlist.append(tag)

        for tag in rmlist:
            tag.getparent().remove(tag)

        # if there are multiple sections because of sth like '<finetuning
        # variant='A'> ... and <finetuning variant='B'> and running preprocess
        # with --variant=A,B the two sections need to be merged
        #
        # Use xpath expressions to identify mergeable sections.
        for mergepath in mergepaths:
            mergenodes = xml.xpath(mergepath)

            # if there is just one section of a type
            # or no section, nothing needs to be done
            if len(mergenodes) < 2:
                continue

            # append all childrens of section[1..n] to section[0] and delete
            # section[1..n]
            for section in mergenodes[1:]:
                for c in section.getchildren():
                    mergenodes[0].append(c)
                section.getparent().remove(section)

        # handle archivedir elements
        xml = combinearchivedir(xml)

        preprocess_mirror_replacement(xml)

        preprocess_proxy_add(xml, proxy)

        # Change public PGP url key to raw key
        preprocess_pgp_key(xml)

        preprocess_iso_option(xml)

        preprocess_initvm_ports(xml)

        if schema.validate(xml):
            # if validation succedes write xml file
            xml.write(
                output,
                encoding="UTF-8",
                pretty_print=True,
                compression=9)
            # the rest of the code is exception and error handling
            return

    except etree.XMLSyntaxError:
        raise XMLPreprocessError("XML Parse error\n" + str(sys.exc_info()[1]))
    except ArchivedirError:
        raise XMLPreprocessError("<archivedir> handling failed\n" +
                                 str(sys.exc_info()[1]))
    except BaseException:
        # Deliberately broad: any other failure is wrapped so callers
        # only ever see XMLPreprocessError.
        raise XMLPreprocessError(
            "Unknown Exception during validation\n" + str(sys.exc_info()[1]))

    # We have errors, return them in string form...
    raise XMLPreprocessError("\n".join(error_log_to_strings(schema.error_log)))
def checkfile(modelXbrl, filepath):
    """Scan a source file line by line for disallowed characters/entities.

    While streaming, a minimal target parser inspects only the root XML
    element to classify the file (testcase vs. inline XBRL); the XML
    declaration is stripped so the text can be re-fed to lxml later.

    :param modelXbrl: model object used for error/warning reporting
    :param filepath: path of the file being checked
    :return: (StringIO of the file text without XML declaration, encoding)
    """
    result = []
    lineNum = 1
    foundXmlDeclaration = False
    isEFM = modelXbrl.modelManager.disclosureSystem.validationType == "EFM"
    file, encoding = modelXbrl.fileSource.file(filepath)
    parserResults = {}

    # Target-parser callback: only the 'start' of the root element matters.
    class checkFileType(object):
        def start(self, tag, attr, nsmap=None):  # check root XML element type
            parserResults["rootIsTestcase"] = tag.rpartition("}")[2] in (
                "testcases", "documentation", "testSuite", "testcase",
                "testSet")
            if tag in ("{http://www.w3.org/1999/xhtml}html",
                       "{http://www.w3.org/1999/xhtml}xhtml"):
                # xhtml root with an ixbrl namespace => inline XBRL;
                # without one it only *may* be inline.
                if nsmap and any(ns in ixbrlAll for ns in nsmap.values()):
                    parserResults["isInline"] = True
                else:
                    parserResults["maybeInline"] = True

        def end(self, tag):
            pass

        def data(self, data):
            pass

        def close(self):
            pass

    _parser = XMLParser(target=checkFileType(), huge_tree=True)
    _isTestcase = False
    mayBeInline = isInline = False
    with file as f:
        while True:
            line = f.readline()
            if line == "":
                break;
            # check for disallowed characters or entity codes
            for match in docCheckPattern.finditer(line):
                text = match.group()
                if text.startswith("&"):
                    if not text in xhtmlEntities:
                        modelXbrl.error(
                            ("EFM.5.02.02.06", "GFM.1.01.02"),
                            _("Disallowed entity code %(text)s in file %(file)s line %(line)s column %(column)s"),
                            modelDocument=filepath, text=text,
                            file=os.path.basename(filepath),
                            line=lineNum, column=match.start())
                elif isEFM and not _isTestcase:
                    if len(text) == 1:
                        modelXbrl.error(
                            "EFM.5.02.01.01",
                            _("Disallowed character '%(text)s' (%(unicodeIndex)s) in file %(file)s at line %(line)s col %(column)s"),
                            modelDocument=filepath, text=text,
                            unicodeIndex="U+{:04X}".format(ord(text)),
                            file=os.path.basename(filepath),
                            line=lineNum, column=match.start())
                    else:
                        modelXbrl.error(
                            "EFM.5.02.01.01",
                            _("Disallowed character '%(text)s' in file %(file)s at line %(line)s col %(column)s"),
                            modelDocument=filepath, text=text,
                            file=os.path.basename(filepath),
                            line=lineNum, column=match.start())
            if lineNum == 1:
                xmlDeclarationMatch = XMLdeclaration.search(line)
                if xmlDeclarationMatch:  # remove it for lxml
                    start, end = xmlDeclarationMatch.span()
                    line = line[0:start] + line[end:]
                    foundXmlDeclaration = True
            if _parser:  # feed line after removal of xml declaration
                _parser.feed(line.encode('utf-8', 'ignore'))
                if "rootIsTestcase" in parserResults:
                    # root XML element has been encountered
                    _isTestcase = parserResults["rootIsTestcase"]
                    if "isInline" in parserResults:
                        isInline = True
                    elif "maybeInline" in parserResults:
                        mayBeInline = True
                    _parser = None  # no point to parse past the root element
            if mayBeInline and inlinePattern.search(line):
                mayBeInline = False
                isInline = True
            if isInline:
                for match in inlineSelfClosedElementPattern.finditer(line):
                    selfClosedLocalName = match.group(3)
                    if selfClosedLocalName not in elementsWithNoContent:
                        modelXbrl.warning(
                            "ixbrl:selfClosedTagWarning",
                            _("Self-closed element \"%(element)s\" may contain text or other elements and should not use self-closing tag syntax (/>) when empty; change these to end-tags in file %(file)s line %(line)s column %(column)s"),
                            modelDocument=filepath, element=match.group(1),
                            file=os.path.basename(filepath),
                            line=lineNum, column=match.start())
            result.append(line)
            lineNum += 1
    result = ''.join(result)
    if not foundXmlDeclaration:  # may be multiline, try again
        xmlDeclarationMatch = XMLdeclaration.search(result)
        if xmlDeclarationMatch:  # remove it for lxml
            start, end = xmlDeclarationMatch.span()
            result = result[0:start] + result[end:]
            foundXmlDeclaration = True
    return (io.StringIO(initial_value=result), encoding)
def __init__(self):
    """Initialise a class-named logger and a lenient UTF-8 lxml parser."""
    # parser tolerates very large documents and tidies redundant namespaces
    lenient_parser = XMLParser(encoding="utf-8", huge_tree=True, ns_clean=True)
    self.xml_parser = lenient_parser
    self.logger = LogUtil.get_logger(self.__class__.__name__, "_logs")
def create_new_epg(args, original_epg_filename, m3u_entries):
    """Create a new xmltv epg tree containing only the channels and programmes
    referenced by the supplied m3u entries.

    :param args: parsed command line options (uses preserve_case,
        xml_sort_type, range, no_tvg_id, force_epg, http_for_images,
        channel_transforms)
    :param original_epg_filename: path of the source epg xml file
    :param m3u_entries: m3u entry objects exposing tvg_id / tvg_name
    :return: an ElementTree holding the new epg, or None on any failure
    """
    # de-duplicate the m3u entries on their lower-cased tvg-id
    tvg_id_unique_entries = {e.tvg_id.lower(): e for e in m3u_entries}.values()
    output_str("creating new xml epg for {} m3u items".format(
        len(tvg_id_unique_entries)))
    try:
        # recover=True lets lxml tolerate minor defects in the source epg
        xml_parser = XMLParser(recover=True)
        original_tree = parse(original_epg_filename, xml_parser)
        original_root = original_tree.getroot()
        if original_root is None:
            output_str(
                "epg creation failure, the supplied source {0} epg file appears to have no root element. Check the source data."
                .format(original_epg_filename))
            return None
        new_root = Element("tv")
        new_root.set("source-info-name", "py-m3u-epg-editor")
        new_root.set("generator-info-name", "py-m3u-epg-editor")
        new_root.set("generator-info-url", "py-m3u-epg-editor")

        # FIX(robustness): initialised up front so the save check at the end
        # cannot raise a NameError when no channel elements get created
        no_epg_channels = []

        # create a channel element for every channel present in the m3u
        epg_channel_count = 0
        created_channels = []
        for channel in original_root.iter('channel'):
            channel_id = channel.get("id")
            channel_created = any(u == channel_id for u in created_channels)
            if channel_id is not None and channel_id != "" and \
                    not channel_created and \
                    any(x.tvg_id.lower() == channel_id.lower() for x in tvg_id_unique_entries):
                output_str("creating channel element for {}".format(channel_id))
                epg_channel_count += 1
                new_channel = SubElement(new_root, "channel")
                new_channel.set(
                    "id",
                    channel_id.lower() if not args.preserve_case else channel_id)
                for elem in channel:
                    new_elem = SubElement(new_channel, elem.tag)
                    elem_text = elem.text
                    if new_elem.tag.lower() == "display-name":
                        # apply any configured channel-name transforms
                        elem_text = transform_string_value(
                            elem_text, None, args.channel_transforms)
                    new_elem.text = elem_text
                    for attr_key in elem.keys():
                        attr_val = elem.get(attr_key)
                        if elem.tag.lower() == "icon" and args.http_for_images:
                            # drop icon references that are not http(s) urls
                            attr_val = attr_val if attr_val.startswith("http") else ""
                        new_elem.set(attr_key, attr_val)
                created_channels.append(channel_id)

        if args.no_tvg_id and args.force_epg:
            # create a channel element for every channel present in the m3u
            # where there is no tvg_id and where there is a tvg_name value
            for entry in m3u_entries:
                if entry.tvg_id is None or entry.tvg_id == "" or entry.tvg_id == "None":
                    output_str(
                        "creating channel element for m3u entry from tvg-name value {}"
                        .format(entry.tvg_name))
                    epg_channel_count += 1
                    new_channel = SubElement(new_root, "channel")
                    new_channel.set("id", entry.tvg_name)
                    new_elem = SubElement(new_channel, "display-name")
                    new_elem.text = entry.tvg_name

        if epg_channel_count > 0:
            # perform any specified channel element sorting
            if args.xml_sort_type == 'alpha':
                channels = new_root.findall("channel[@id]")
                alpha_sorted_channels = sorted(
                    channels,
                    key=lambda ch_elem: (ch_elem.tag, ch_elem.get('id')))
                new_root[:] = alpha_sorted_channels
            elif args.xml_sort_type == 'm3u':
                # order channels by their position in the m3u
                channels = new_root.findall("channel[@id]")
                m3u_sorted_channels = sorted(
                    channels,
                    key=lambda ch_elem: (ch_elem.tag, [
                        x.tvg_id.lower() for x in tvg_id_unique_entries
                    ].index(ch_elem.get('id').lower())))
                new_root[:] = m3u_sorted_channels

            all_epg_programmes_xpath = 'programme'
            all_epg_programmes = original_tree.findall(all_epg_programmes_xpath)
            if len(all_epg_programmes) > 0 and not args.preserve_case:
                # force the channel (tvg-id) attribute value to lowercase to
                # enable a case-insensitive xpath lookup with:
                #   channel_xpath = 'programme[@channel="' + entry.tvg_id.lower() + '"]'
                for programme in all_epg_programmes:
                    for attr_key in programme.keys():
                        attr_val = programme.get(attr_key)
                        if attr_key.lower() == 'channel' and attr_val is not None:
                            programme.set(attr_key, attr_val.lower())

            # now copy all programme elements from the original epg for every
            # channel present in the m3u
            max_programme_start_timestamp = datetime.datetime.now(
                tzlocal.get_localzone()) - datetime.timedelta(days=365 * 10)
            programme_count = 0
            for entry in tvg_id_unique_entries:
                if entry.tvg_id is not None and entry.tvg_id != "" and entry.tvg_id != "None":
                    output_str("creating programme elements for {}".format(
                        entry.tvg_name))
                    # case-insensitive xpath search
                    channel_xpath = entry.tvg_id.lower(
                    ) if not args.preserve_case else entry.tvg_id
                    channel_xpath = 'programme[@channel="' + channel_xpath + '"]'
                    channel_programmes = original_tree.findall(channel_xpath)
                    if len(channel_programmes) > 0:
                        for elem in channel_programmes:
                            programme_start_timestamp = dateutil.parser.parse(
                                elem.get("start"))
                            # track the latest programme start seen so far
                            max_programme_start_timestamp = max(
                                max_programme_start_timestamp,
                                programme_start_timestamp)
                            if is_in_range(args, programme_start_timestamp):
                                programme_count += 1
                                # deep-copy the programme element (two levels
                                # of nested children) into the new tree
                                programme = SubElement(new_root, elem.tag)
                                for attr_key in elem.keys():
                                    programme.set(attr_key, elem.get(attr_key))
                                for sub_elem in elem:
                                    new_elem = SubElement(programme, sub_elem.tag)
                                    new_elem.text = sub_elem.text
                                    for attr_key in sub_elem.keys():
                                        new_elem.set(attr_key, sub_elem.get(attr_key))
                                    for sub_sub_elem in sub_elem:
                                        new_sub_elem = SubElement(
                                            new_elem, sub_sub_elem.tag)
                                        new_sub_elem.text = sub_sub_elem.text
                                        for attr_key in sub_sub_elem.keys():
                                            new_sub_elem.set(
                                                attr_key, sub_sub_elem.get(attr_key))
                    else:
                        if not args.no_tvg_id or not args.force_epg:
                            no_epg_channels.append(entry)
                else:
                    if not args.no_tvg_id or not args.force_epg:
                        no_epg_channels.append(entry)

            if args.no_tvg_id and args.force_epg:
                # create pseudo programme elements for every channel present in
                # the m3u where there is no tvg_id and there is a tvg_name value
                for entry in m3u_entries:
                    if entry.tvg_id is None or entry.tvg_id == "" or entry.tvg_id == "None":
                        output_str(
                            "creating pseudo programme elements for m3u entry {}".
                            format(entry.tvg_name))
                        programme_start_timestamp = datetime.datetime.now(
                            tzlocal.get_localzone())
                        programme_stop_timestamp = programme_start_timestamp + \
                            datetime.timedelta(hours=2)
                        # FIX(bug): this conditional was inverted
                        # (max = max if start > max else start), which kept the
                        # *smaller* timestamp; track the latest start instead,
                        # consistent with the copy loop above
                        max_programme_start_timestamp = max(
                            max_programme_start_timestamp,
                            programme_start_timestamp)
                        for i in range(
                                1, 84
                        ):  # create programme elements within a max 7 day window and no more limited by the configured range
                            if is_in_range(args, programme_start_timestamp):
                                programme_count += 1
                                programme = SubElement(new_root, "programme")
                                programme.set(
                                    "start",
                                    programme_start_timestamp.strftime(
                                        "%Y%m%d%H0000 %z"))
                                programme.set(
                                    "stop",
                                    programme_stop_timestamp.strftime(
                                        "%Y%m%d%H0000 %z"))
                                programme.set("channel", entry.tvg_name)
                                title_elem = SubElement(programme, "title")
                                title_elem.text = entry.tvg_name
                                desc_elem = SubElement(programme, "desc")
                                desc_elem.text = entry.tvg_name
                            # NOTE(review): the stop timestamp never advances
                            # and the start advances by a growing step (i * 2h);
                            # confirm this matches the intended 7-day window
                            programme_start_timestamp = programme_start_timestamp + \
                                datetime.timedelta(hours=i * 2)

            now = datetime.datetime.now(tzlocal.get_localzone())
            range_start = now - datetime.timedelta(hours=args.range)
            range_end = now + datetime.timedelta(hours=args.range)
            output_str(
                'configured epg programme start/stop range is +/-{0}hrs from now ({1} <-> {2})'
                .format(args.range, range_start.strftime("%d %b %Y %H:%M"),
                        range_end.strftime("%d %b %Y %H:%M")))
            output_str('latest programme start timestamp found was: {0}'.format(
                max_programme_start_timestamp.strftime("%d %b %Y %H:%M")))
            output_str(
                '{0} programmes were added to the epg'.format(programme_count))

        indent(new_root)
        tree = ElementTree(new_root)
        if len(no_epg_channels) > 0:
            save_no_epg_channels(args, no_epg_channels)
        return tree
    except Exception as e:
        # likely a mangled xml parse exception
        output_str("epg creation failure: {0}".format(e))
        return None
# Copyright (C) 2012-2018 by Dr. Dieter Maurer <*****@*****.**>; see 'LICENSE.txt' for details """Auxiliary classes to construct signature/encryption templates.""" from lxml.etree import ElementBase, \ parse as et_parse, fromstring as et_fromstring, XML as et_xml, \ XMLParser, ElementNamespaceClassLookup, ElementDefaultClassLookup from dm.xmlsec.binding import DSigNs, dsig, EncNs, enc # set up our own parser and related `etree` infrastructure parser = XMLParser() # apparently, `parser` has a `set_element_class_lookup` but not corresponding `get` #class_lookup = ElementNamespaceClassLookup(parser.get_element_class_lookup()) class_lookup = ElementNamespaceClassLookup(ElementDefaultClassLookup()) parser.set_element_class_lookup(class_lookup) Element = parser.makeelement def SubElement(node, *args, **kw): node.append(Element(*args, **kw)) def parse(file, parser=parser): return et_parse(file, parser=parser) def fromstring(s, parser=parser): return et_fromstring(s, parser=parser) def XML(s, parser=parser):
def load_xml_from_file(file):
    """Parse *file* into an lxml tree, keeping CDATA sections intact."""
    cdata_preserving_parser = XMLParser(strip_cdata=False)
    return parse(file, parser=cdata_preserving_parser)