def prepareNS():
    """
    Register the additional XPath function namespaces.

    It only needs to be called once; calling it again has no further effect.

    :return: None
    """
    bugsutils_ns = etree.FunctionNamespace('http://bugsutils.de/')
    bugsutils_ns['strtodoc'] = xpath_strtodoc

    # The wxxrc namespace registry is only looked up here; no function is
    # added to it in this block.
    etree.FunctionNamespace('http://www.wxwindows.org/wxxrc')
def get_cover(html_root):
    """
    Return the first ``<img>`` element that looks like a cover image.

    Three queries are tried in order and the first hit wins:

    1. an ``<img>`` whose id, class, name, src or alt contains "cover";
    2. an ``<img>`` below a ``<div>`` whose id, class, name or src contains
       "cover";
    3. an ``<img>`` below an ``<a>`` with the same attribute test.

    The comparison is case-insensitive through a ``lower-case()`` XPath
    extension function that this call registers in lxml's default function
    namespace.

    :param html_root: element (or tree) to search with ``.xpath()``
    :return: the matching ``<img>`` element, or None when nothing matches
    """
    def lower_case(_, nodes):
        # XPath passes attribute matches as a list; use the first one.
        return nodes[0].lower() if nodes and len(nodes) else ""

    default_ns = etree.FunctionNamespace(None)
    default_ns["lower-case"] = lower_case

    def cover_test(*attributes):
        # Joining with " or " keeps a space around every operator. The
        # original adjacent string literals produced "orcontains(".
        return " or ".join(
            "contains(lower-case(@%s), 'cover')" % attribute
            for attribute in attributes)

    queries = (
        "//img[%s]" % cover_test("id", "class", "name", "src", "alt"),
        "//div[%s]//img" % cover_test("id", "class", "name", "src"),
        "//a[%s]//img" % cover_test("id", "class", "name", "src"),
    )
    for query in queries:
        matches = html_root.xpath(query)
        if matches:
            return matches[0]
    return None
def transform(site):
    """
    Run the ``rss-to-pubs.xsl`` stylesheet over the XML file of *site*.

    Reads ``<site.name>.xml`` and writes the result to
    ``<site.name>_out.xml``. The stylesheet gets the extension functions
    ``get_date``, ``lookup_person`` and ``clean_html`` in the "python"
    function namespace, plus the string parameters ``site`` and ``root_org``.
    """
    source_path = site.name + ".xml"
    stylesheet_path = 'rss-to-pubs.xsl'
    output_path = site.name + "_out.xml"

    # Person resolver for the current site, used to look up names.
    resolver = PersonResolver(site)
    resolver.load_matches()

    # Make the custom functions available to the XSLT context.
    python_ns = ET.FunctionNamespace("python")
    extension_functions = (
        ('get_date', get_date),
        ('lookup_person', resolver.lookup_person),
        ('clean_html', clean_html),
    )
    for function_name, function in extension_functions:
        python_ns[function_name] = function

    source_doc = ET.parse(source_path)
    stylesheet = ET.XSLT(ET.parse(stylesheet_path))

    # Dynamic parameters for the stylesheet, e.g. the root organisation ID.
    result = stylesheet(
        source_doc,
        site=ET.XSLT.strparam(site.name),
        root_org=ET.XSLT.strparam(site.root_org),
    )

    result.write(
        output_path,
        pretty_print=True,
        xml_declaration=True,
        encoding="utf-8",
    )
def sprinkle_terms(xml):
    """
    Takes in an etree.Element and sprinkles in hints for the index wherever
    a term is used.

    The non-empty, stripped results of ``TERM_XPATH(xml)`` form the term set.
    A ``term-finder`` XPath extension function is registered in the
    ``uri:custom-func`` namespace, then ``stylesheet(xml)`` is returned.
    """
    stripped = (obj.strip() for obj in TERM_XPATH(xml))
    terms = {term for term in stripped if term != ''}

    def term_finder(context, s):
        # Splits s[0] around known terms. After each term an <indexterm>
        # element with a <primary> child naming the term is emitted.
        pieces = []
        remaining = s[0]  # part of the string not consumed yet
        # TODO: keep looping
        matched = True
        while matched:
            matched = False
            for term in terms:
                if term not in remaining:
                    continue
                matched = True
                start = remaining.find(term)
                if start > 0:
                    pieces.append(remaining[:start])
                remaining = remaining[start + len(term):]
                pieces.append(term)
                indexterm = etree.Element(
                    '{%s}indexterm' % NAMESPACES['db'])
                primary = etree.SubElement(
                    indexterm, '{%s}primary' % NAMESPACES['db'])
                primary.text = term
                pieces.append(indexterm)
        pieces.append(remaining)
        return pieces

    custom_ns = etree.FunctionNamespace('uri:custom-func')  # global namespace
    custom_ns['term-finder'] = term_finder
    return stylesheet(xml)
def scrape_instagram(user_id: str, session, url_set: UrlSet) -> None:
    """
    Collect image URLs for an Instagram profile into *url_set*.

    Fetches the profile page, extracts the JSON assigned to
    ``window._sharedData``, appends the profile picture URL (the HD one when
    present) and hands over to ``get_urls`` with the numeric user id found
    in that JSON.
    """
    domain = 'www.instagram.com'
    base = f'https://{domain}'
    profile_url = f'{base}/{user_id}/'
    logger = logging.getLogger(__name__)

    root = get_html_dom_content(session.get(profile_url))

    # Register the EXSLT regular-expression namespace under the 're' prefix
    # so that XPath expressions can call e.g. re:match().
    regex_ns = etree.FunctionNamespace("http://exslt.org/regular-expressions")
    regex_ns.prefix = 're'

    scripts = root.xpath('//script[re:match(text(), "^window._sharedData")]')
    assert len(scripts) == 1
    script_text = script_text_raw = scripts[0].text

    # The JSON payload spans from the first '{' to the last '}'.
    json_start = script_text.find('{')
    json_end = script_text_raw.rfind('}') + 1
    shared_data = json.loads(script_text[json_start:json_end])

    profile_pages = shared_data['entry_data']['ProfilePage']
    assert len(profile_pages) == 1
    user = profile_pages[0]["graphql"]["user"]

    # Prefer the HD profile picture and fall back to the regular one.
    for picture_key in ('profile_pic_url_hd', 'profile_pic_url'):
        if picture_key in user:
            url_set.append(user[picture_key])
            break

    user_id = user['id']
    get_urls(logger, session, base, url_set, user_id)
def transform_xml(xml, name, classifications, lang='en-GB', translated='',
                  validate=False):
    """
    Write *xml* to ``<name>.xml`` and transform it with
    ``<name>_masterlist.xsl``.

    The transformed document goes to ``<name>_converted.xml``. When
    *validate* is true that file is checked against ``<name>.xsd`` through
    ``validate_xml``. *classifications* is not used by this function.
    """
    ET.ElementTree(xml).write(name + '.xml')

    language, country = localize(lang)
    language2, country2 = localize(translated)

    # Make the custom function available to the XSLT context.
    python_ns = ET.FunctionNamespace("python")
    python_ns['get_client_id_uri'] = get_client_id_uri

    stylesheet = ET.XSLT(ET.parse(name + '_masterlist.xsl'))
    strparam = ET.XSLT.strparam
    converted = stylesheet(
        xml,
        language=strparam(language),
        language2=strparam(language2),
        country=strparam(country),
        country2=strparam(country2),
        translated=strparam(str(translated != '')),
    )

    out = name + '_converted.xml'
    converted.write(
        out,
        pretty_print=True,
        xml_declaration=True,
        encoding="utf-8",
        standalone=True,
    )

    if validate:
        validate_xml(out, name + ".xsd")
def content(self):
    """
    Create response

    Registers ``self.isl_vu`` as the ``pxslt:isl_vu`` XSLT extension
    function, picks the stylesheet from the ``type`` request field ("defs"
    selects the defs stylesheet, anything else the SVG one) and applies it
    to the configuration document addressed by the request path.
    """
    global svg_stylesheet
    global defs_stylesheet

    ns = etree.FunctionNamespace("http://www.praterm.com.pl/ISL/pxslt")
    ns.prefix = 'pxslt'
    ns['isl_vu'] = self.isl_vu

    fc = util.FieldStorage(self.req)
    req_type = fc.getfirst('type')
    if req_type == 'defs':
        stylesheet = XSLT(defs_stylesheet)
    else:
        stylesheet = XSLT(svg_stylesheet)

    # this should be set in Apache configuration file using
    # PythonOption directive, for example:
    # PythonOption szarp.paramd.uri "http://localhost:8081/"
    paramd_uri = self.req.get_options()['szarp.paramd.uri']

    fname = ('file:///etc/szarp/default/config'
             + self.req.parsed_uri[apache.URI_PATH])
    doc = etree.parse(fname, parser)

    r = 'test'
    try:
        # The URI is wrapped in single quotes so XSLT sees a string literal.
        r = stylesheet(doc, uri="'" + paramd_uri + "'")
    except Exception as e:
        # "except X as e" is valid on Python 2.6+ and Python 3, unlike the
        # former "except X, e" spelling.
        self.req.log_error('content: stylesheet failed %s' % str(e),
                           apache.APLOG_ERR)
        r = stylesheet.error_log
def __init__(self, session, config, parent):
    """
    Set up the XSLT transformer from its configuration.

    Resolves the ``xsltPath`` path (relative paths are joined onto
    ``defaultPath``), registers ``myTimeFn`` as the ``now`` extension
    function, compiles the stylesheet and parses the optional ``parameter``
    setting into ``self.params``.

    The ``parameter`` setting is a space-separated list of ``key:value``
    pairs. Each value is stored wrapped in double quotes.

    :raises ConfigFileException: when no ``xsltPath`` is configured
    """
    Transformer.__init__(self, session, config, parent)

    xfrPath = self.get_path(session, "xsltPath")
    if xfrPath is None:
        raise ConfigFileException("Missing path 'xsltPath' for "
                                  "{0}.".format(self.id))

    if os.path.isabs(xfrPath):
        path = xfrPath
    else:
        dfp = self.get_path(session, "defaultPath")
        path = os.path.join(dfp, xfrPath)

    ns = etree.FunctionNamespace(
        'http://www.cheshire3.org/ns/function/xsl/')
    ns['now'] = myTimeFn
    self.functionNamespace = ns

    self.parsedXslt = etree.parse(path)
    self.txr = etree.XSLT(self.parsedXslt)

    self.params = None
    parameter = self.get_setting(session, 'parameter', None)
    if parameter:
        self.params = {}
        for pair in parameter.split(' '):
            if not pair:
                # Doubled spaces give empty items; these used to raise.
                continue
            # Split on the first ':' only, so values may contain ':'.
            key, value = pair.split(':', 1)
            self.params[key] = '"%s"' % value
def transform(self, sourcedoc, template, functions):
    """
    Apply the XSLT *template* to *sourcedoc*.

    Registers the ``e``, ``form`` and ``session`` extension functions plus
    every entry of *functions* in the ``XIERPANAMESPACE`` function
    namespace before the stylesheet is compiled.

    :param sourcedoc: parsed source document to transform
    :param template: stylesheet path, resolved with ``self.e.path2fspath``;
        None skips the transformation
    :param functions: optional mapping of name to extension function
    :return: the transformation result, or None when *template* is None
    """
    # These three callbacks are built here as closures, otherwise self.e
    # would not be available when XSLT calls back.
    def gete(dummy, key):
        return self._gete(self.e, key)

    def getform(dummy, key):
        return self._getform(self.e, key)

    def getsession(dummy, *args):
        return self._getsession(self.e, args)

    ns = etree.FunctionNamespace(XIERPANAMESPACE)
    ns['e'] = gete
    ns['form'] = getform
    ns['session'] = getsession

    # Add all required extension functions.
    if functions is not None:
        for name, function in functions.items():
            ns[name] = function

    if template is None:
        return None

    # TODO: Maybe add caching of style docs in the future?
    styledoc = etree.parse(self.e.path2fspath(template), self.parser)
    style = etree.XSLT(styledoc)
    # Calling the XSLT object replaces the deprecated style.apply().
    return style(sourcedoc)
def soran_transform_dir(dir_name: Path, oname: str, out_dir: str,
                        codeNEB: str, title_name: str,
                        temp_path: str) -> Optional[Path]:
    """
    Transform a journal directory with the module-level ``XSLT`` stylesheet.

    With ``oname == '-'`` the result is printed and None is returned.
    Otherwise the result is written as UTF-8 to *oname* (below *out_dir*
    when that is set) and the output path is returned. An empty *oname* is
    replaced by ``<codeNEB>_<today>_unicode.xml``, where an empty *codeNEB*
    falls back to ``journal.codeNEB``.
    """
    src_root = construct_dir_source(dir_name, temp_path, title_name)

    function_ns = etree.FunctionNamespace(
        'http://promsoft.ru/soran_transform/dir')
    function_ns.prefix = 'ps'
    journal = JornalParser(function_ns)

    # The stylesheet is parsed with network access disabled and with the
    # project's file resolver installed.
    parser = etree.XMLParser(no_network=True)
    parser.resolvers.add(FileResolver())
    transform = etree.XSLT(etree.XML(XSLT, parser))
    parser = etree.HTMLParser()  # not used by the remaining statements

    res_trans = transform(src_root)

    if oname == '-':
        print(res_trans)
        return

    if not oname:
        if not codeNEB:
            codeNEB = journal.codeNEB
        oname = f'{codeNEB}_{datetime.now().date().isoformat()}_unicode.xml'

    if out_dir:
        opath = Path(out_dir) / oname
    else:
        opath = Path(oname)

    with opath.open('wt', encoding='utf-8') as out:
        out.write('%s' % res_trans)
    logging.info('Создан файл: %s', opath)
    return opath
def gameData(self, gameId):
    """Display a Game's details in XML format:
    https://www.mythtv.org/wiki/MythTV_Universal_Metadata_Format

    The game record is requested as XML, transformed with
    ``XSLT/giantbombGame.xsl`` and written to stdout when the result
    contains ``item`` elements. The process exits with status 0 afterwards,
    or with status 1 when the response is not parseable XML.

    Returns nothing
    """
    with requests.Session() as ReqSession:
        url = self.config['dataURL'] % gameId
        params = {
            "api_key": self.config['apikey'],
            "format": 'xml',
        }
        headers = self.config['headers']
        res = ReqSession.get(url, params=params, headers=headers)

        try:
            videoResult = etree.fromstring(res.content)
        except Exception as errmsg:
            sys.stderr.write(u"! Error: Invalid XML was received from www.giantbomb.com (%s)\n" % errmsg)
            sys.exit(1)

        gameXslt = etree.XSLT(etree.parse(
            u'%s/XSLT/giantbombGame.xsl' % self.baseProcessingDir))

        # Expose the helper functions to the stylesheet.
        gamebombXpath = etree.FunctionNamespace(
            'https://www.mythtv.org/wiki/MythTV_Universal_Metadata_Format')
        gamebombXpath.prefix = 'gamebombXpath'
        self.buildFuncDict()
        for func_name in list(self.FuncDict.keys()):
            gamebombXpath[func_name] = self.FuncDict[func_name]

        items = gameXslt(videoResult)

        if items.getroot() is not None and len(items.xpath('//item')):
            sys.stdout.write(etree.tostring(
                items,
                encoding='UTF-8',
                method="xml",
                xml_declaration=True,
                pretty_print=True,
            ))
        sys.exit(0)
def write_out(self, page, xml_subpages, output):
    """
    Transform the cached HTML of *page* and write it below ``<output>/html``.

    Registers a ``subpages`` XPath extension function (namespace
    ``uri:hotdoc``) that returns *xml_subpages*, validates the cached
    document, emits ``writing_page_signal`` and writes the transformed page
    prefixed with an HTML doctype.
    """
    # pylint: disable=missing-docstring
    def subpages(_):
        return xml_subpages

    hotdoc_ns = etree.FunctionNamespace('uri:hotdoc')
    hotdoc_ns['subpages'] = subpages

    html_output = os.path.join(output, 'html')
    rel_path = os.path.join(self.get_output_folder(page), page.link.ref)
    cached_path = os.path.join(self.__cache_dir, rel_path)
    full_path = os.path.join(html_output, rel_path)

    target_dir = os.path.dirname(full_path)
    if not os.path.exists(target_dir):
        os.makedirs(os.path.dirname(full_path))

    with open(cached_path, 'r', encoding='utf-8') as cached_file:
        doc_root = etree.HTML(cached_file.read())

    self.__validate_html(self.extension.project, page, doc_root)
    self.writing_page_signal(self, page, full_path, doc_root)

    with open(full_path, 'w', encoding='utf-8') as out_file:
        transformed = str(self.__page_transform(doc_root))
        out_file.write('<!DOCTYPE html>\n%s' % transformed)
def __init__(self, path):
    '''
    Constructor

    Parses the XML file at *path*, creating it first through
    ``self.create_new_file()`` when it does not exist, and registers a
    ``lower-case`` XPath function in lxml's default function namespace.
    '''
    self._path = path

    if not os.path.exists(path):
        print('No existing file at {0}. Creating file...'.format(path))
        self.create_new_file()
        print('New file created at {0}'.format(path))

    self._data = etree.parse(path)
    self._root = self._data.getroot()
    self._monthNodes = self._root.iter('month')
    self._indent_after_treatment = True

    def lower_case(context, text):
        """
        XPath extension function enabling case-insensitive XPath searches,
        as explained at http://lxml.de/extensions.html
        """
        return [item.lower() for item in text]

    default_ns = etree.FunctionNamespace(None)
    default_ns['lower-case'] = lower_case
def parse_xml(self):
    """
    Call ``self.create_files`` for every ``/temas/tema`` element.

    The ``slug`` function is registered as ``f:slugify`` (namespace
    ``urn:federico``) before the stylesheet at ``self.stylesheet`` is
    compiled. The compiled transform is passed along with each element.
    """
    temas = self.xml.xpath('/temas/tema')

    federico_ns = etree.FunctionNamespace("urn:federico")
    federico_ns.prefix = 'f'
    federico_ns['slugify'] = slug

    transform = etree.XSLT(etree.parse(self.stylesheet))

    for tema in temas:
        self.create_files(tema, transform)
def __init__(self, elastic):
    """
    Initialise the base class with ``initial=True``, set up the paginated
    entry resource and register a ``string-join`` XPath function.
    """
    super().__init__(elastic, initial=True)

    # TODO remove min_page tag.
    self.entry_resource = PaginatedResource(
        URL_TEMPLATE,
        min_page=2338,
        max_page=9999,
    )

    # lxml's XPath has no string-join(); add it to the default namespace.
    default_ns = etree.FunctionNamespace(None)
    default_ns["string-join"] = _string_join
def run(self):
    """Run method that loads and starts the plugin

    Does nothing when the plugin is already active. Otherwise it creates
    the dock widget if needed, shows it, connects the widget signals,
    registers the ``cis:`` XSLT extension functions from ``ciselniky``,
    compiles ``xslt/hk.xsl`` and sets the map canvas selection colour.
    """
    if not self.pluginIsActive:
        self.pluginIsActive = True

        # dockwidget may not exist if:
        #    first run of plugin
        #    removed on close (see self.onClosePlugin method)
        if self.dockwidget is None:
            # Create the dockwidget (after translation) and keep reference
            self.dockwidget = islh_parserDockWidget()

        # connect to provide cleanup on closing of dockwidget
        self.dockwidget.closingPlugin.connect(self.onClosePlugin)

        # show the dockwidget
        # TODO: fix to allow choice of dock location
        self.iface.addDockWidget(Qt.LeftDockWidgetArea, self.dockwidget)
        self.dockwidget.show()

        # button actions
        self.dockwidget.input_file.clear()
        self.dockwidget.input_file_button.clicked.connect(
            self.select_input_xml)
        self.dockwidget.input_read.clicked.connect(self.read_islh)
        self.dockwidget.input_lhc.activated.connect(self.select_lhc)
        self.dockwidget.input_odd.activated.connect(self.select_odd)
        self.dockwidget.input_dil.activated.connect(self.select_dil)
        self.dockwidget.input_por.activated.connect(self.select_por)
        self.dockwidget.input_psk.activated.connect(self.select_psk)
        self.dockwidget.input_hk_button.clicked.connect(self.show_hk)

        # load xslt stuff
        self.ns = etree.FunctionNamespace("http://ciselniky")
        self.ns.prefix = "cis"
        self.ns['lesni_oblast'] = ciselniky.lesni_oblast
        self.ns['slt'] = ciselniky.slt
        self.ns['katuze'] = ciselniky.katuze
        self.ns['lvs'] = ciselniky.lvs
        self.ns['zvl_statut'] = ciselniky.zvl_statut

        # Read the stylesheet through a context manager so the file is
        # closed right away.
        with open('%s/xslt/hk.xsl' % self.plugin_dir, 'r') as xslt_file:
            self.xslt_root = etree.XML(xslt_file.read())
        self.transform = etree.XSLT(self.xslt_root)

        # set the selection colour to something not present in the
        # "por" map
        self.iface.mapCanvas().setSelectionColor(QColor('#f40'))

        # NOTE(review): this was "from karto_fce import *", which is a
        # SyntaxError inside a function on Python 3. Nothing follows it, so
        # the names it bound were never used and only the import itself
        # matters - confirm karto_fce is needed here at all.
        import karto_fce  # noqa: F401
def initialize():
    """
    Initializes global values such as prepared_checks and parser to avoid
    doing it for each file.

    Registers the ``py:`` XSLT extension functions, creates the shared XML
    parser and compiles every ``*.xslc`` file in the ``xsl-checks``
    directory next to this module. A check file that fails to compile is
    reported and skipped.

    :return: True when initialization has run (or already ran before),
        False when no check files were found
    """
    global sdsc_initialized
    global parser
    global prepared_checks

    if sdsc_initialized:
        return True

    # Prepare parser (add py: namespace)
    ns = etree.FunctionNamespace(
        'https://www.github.com/openSUSE/suse-doc-style-checker')
    ns.prefix = 'py'
    ns.update(
        dict(linenumber=linenumber,
             termcheck=termcheck,
             buildtermdata=buildtermdata,
             dupecheck=dupecheck,
             sentencelengthcheck=sentencelengthcheck,
             sentencesegmenter=sentencesegmenter,
             tokenizer=tokenizer,
             counttokens=counttokens,
             splitpath=splitpath))

    parser = etree.XMLParser(ns_clean=True,
                             remove_pis=False,
                             dtd_validation=False)

    # Prepare all checks
    prepared_checks = []
    location = os.path.dirname(os.path.realpath(__file__))
    checkfiles = glob.glob(os.path.join(location, 'xsl-checks', '*.xslc'))

    if not checkfiles:
        printcolor(
            "! No check files found.\n Add check files to " +
            os.path.join(location, 'xsl-checks'), 'error')
        return False

    for checkfile in checkfiles:
        # File name without directory and extension. os.path handles the
        # platform's separator, and the former regexes used an unescaped
        # '.' and a hard-coded '/'.
        checkmodule = os.path.splitext(os.path.basename(checkfile))[0]
        try:
            transform = etree.XSLT(etree.parse(checkfile, parser))
        except Exception as error:
            printcolor("! Syntax error in check file.\n " + checkfile,
                       'error')
            printcolor(" " + str(error), 'error')
        else:
            prepared_checks.append({
                'name': checkmodule,
                'transform': transform
            })

    sdsc_initialized = True
    return True
def soran_transform_page(iname, oname):
    """
    Transform the HTML file *iname* with the module-level ``XSLT``
    stylesheet and write the result to *oname* as UTF-8.

    The function namespace ``http://promsoft.ru/soran_transform/page``
    (prefix ``ps``) is handed to ``JornalParser`` before the stylesheet is
    compiled.
    """
    ns = etree.FunctionNamespace('http://promsoft.ru/soran_transform/page')
    ns.prefix = 'ps'
    journal = JornalParser(ns)

    transform = etree.XSLT(etree.XML(XSLT))

    src_root = etree.parse(iname, etree.HTMLParser())
    res_trans = transform(src_root)

    # Context manager so the output file is flushed and closed on return.
    with open(oname, 'wt', encoding='utf-8') as out:
        out.write('%s' % res_trans)
def parse_wfst_response(schema_xml_str):
    """
    Extract the transaction summary from a WFS-T response document.

    Every prefix declared on the root element is registered through
    ``etree.FunctionNamespace`` so the ``wfs:`` prefix can be used in the
    XPath expression below.

    :param schema_xml_str: the response document as an XML string
    :return: dict mapping the local name of each child of
        ``wfs:TransactionSummary`` to its text
    :raises IndexError: when the document has no
        ``wfs:TransactionResponse/wfs:TransactionSummary`` element
    """
    xml = etree.XML(schema_xml_str)
    tree = etree.ElementTree(xml)
    root = tree.getroot()

    for prefix, uri in root.nsmap.items():
        etree.FunctionNamespace(uri).prefix = prefix

    summary_element = tree.xpath(
        '//wfs:TransactionResponse/wfs:TransactionSummary')

    summary = {}
    # Iterate the element directly; getchildren() is deprecated.
    for child in summary_element[0]:
        # rpartition gives the local name for "{ns}name" and also for
        # un-namespaced tags, where split('}')[1] raised IndexError.
        summary[child.tag.rpartition('}')[2]] = child.text
    return summary
def parse_schema(schema_xml_str):
    """
    Map element names to their types for an XSD feature schema.

    Every prefix declared on the root element is registered through
    ``etree.FunctionNamespace`` so the ``xsd:`` prefix can be used in the
    XPath expression. Returns a dict of ``name`` to ``type`` for the
    elements of the complex type's extension sequence.
    """
    tree = etree.ElementTree(etree.XML(schema_xml_str))
    root = tree.getroot()

    for prefix, uri in root.nsmap.items():
        registry = etree.FunctionNamespace(uri)
        registry.prefix = prefix

    elements = tree.xpath(
        '//xsd:schema/xsd:complexType/xsd:complexContent/xsd:extension/xsd:sequence/xsd:element')

    return {
        element.attrib['name']: element.attrib['type']
        for element in elements
    }
def transform(document, template_name, template_context=None):
    """
    Render the template *template_name* and apply it to *document* as XSLT.

    ``url_func`` and ``safe_href`` are registered as ``molly:url`` and
    ``molly:safe-href``. The template is rendered with *template_context*
    (an empty context when omitted) and the rendered text is compiled as
    the stylesheet.
    """
    molly_ns = etree.FunctionNamespace('http://mollyproject.org/xpath#')
    molly_ns.prefix = 'molly'
    molly_ns['url'] = url_func
    molly_ns['safe-href'] = safe_href

    # Load the template and turn its rendered output into a stylesheet.
    django_template = loader.get_template(template_name)
    rendered = django_template.render(Context(template_context or {}))
    stylesheet = etree.XSLT(etree.XML(rendered))

    return stylesheet(document)
def register_xpath_namespaces():
    """
    Assign the conventional prefixes to the EXSLT function namespaces.

    After this call XPath expressions can use the prefixes ``date``,
    ``dyn``, ``exsl``, ``func``, ``math``, ``random``, ``re``, ``set`` and
    ``str``.
    """
    fns = {
        'date': 'http://exslt.org/dates-and-times',
        'dyn': 'http://exslt.org/dynamic',
        'exsl': 'http://exslt.org/common',
        'func': 'http://exslt.org/functions',
        'math': 'http://exslt.org/math',
        'random': 'http://exslt.org/random',
        're': 'http://exslt.org/regular-expressions',
        'set': 'http://exslt.org/sets',
        'str': 'http://exslt.org/strings',
    }
    # items() exists on Python 2 and 3; iteritems() is Python 2 only.
    for prefix, uri in fns.items():
        etree.FunctionNamespace(uri).prefix = prefix
def setup(config):
    """
    Store *config* in the module-level ``_config`` and register the
    extension functions in the ``urn:mrbavii:xmlsite`` namespace.
    """
    # TODO: find a way that doesn't require storing global
    # (Can we pass a config reference in the context?)
    global _config
    _config = config

    xmlsite_ns = etree.FunctionNamespace('urn:mrbavii:xmlsite')
    registrations = (
        ('base-uri', base_uri),
        ('rbase-uri', rbase_uri),
        ('dirname', dirname),
        ('basename', basename),
        ('highlight_code', highlight_code),
        ('highlight_file', highlight_file),
    )
    for xpath_name, function in registrations:
        xmlsite_ns[xpath_name] = function
def __init__(self):
    """
    Create an empty schema parser and register ``self.description`` as the
    ``description`` XPath function in the default function namespace.
    """
    # Parse state
    self.schema_changes = []
    self.dom_tree = None
    self.model = None

    # Global schema content
    self.attributes = []  # the schema global attributes
    self.elements = []  # the schema global elements
    self.complex = []  # the schema global complex types
    self.enums = {}  # dictionary of global enums
    self.dict = {}  # the schema element dictionary of legal types

    # Schema identification
    self.uri = ''
    self.namespace = {}  # the namespaces used by the OCX
    self.version = 'No version'  # the schema version of the parsed xsd

    self.logger = logging.getLogger('schema_validation')

    self.ns = ET.FunctionNamespace(None)
    self.ns['description'] = self.description
def full_description(self):
    """
    Return ``self.desc_node`` as a string after running the inline-element
    stylesheet followed by the block-element stylesheet.

    ``PackageDescription.CustomXPath.block_format`` is registered as
    ``bolt:block_format`` before the stylesheets are compiled.
    """
    bolt_ns = etree.FunctionNamespace(
        "http://schema.boltlinux.org/2011/XSL/BoltPack")
    bolt_ns.prefix = "bolt"
    bolt_ns["block_format"] = PackageDescription.CustomXPath.block_format

    apply_inline = etree.XSLT(
        etree.fromstring(PackageDescription.INLINE_ELEMENTS_STYLE))
    apply_block = etree.XSLT(
        etree.fromstring(PackageDescription.BLOCK_ELEMENTS_STYLE))

    inline_result = apply_inline(self.desc_node)
    return str(apply_block(inline_result))
def _transform(self, results):
    """
    Method: Implement the abstract method.

    Parses every result's ``content`` into ``self.data``, then runs
    ``xslt/xml2pop.xsl`` once and returns a single ``Results.pop`` entry.
    The stylesheet receives the current local date and time as the ``Date``
    parameter and can call ``self.params`` through the ``uri:params``
    function namespace.

    Outputs: Returns a list of results (title, content)

    :raises etree.ParseError: when a result or the stylesheet is not
        well-formed (propagated unchanged)
    """
    transformation = []
    self.data = []

    for result in results:
        xml = result['content'].encode('utf-8')
        if xml != "":
            self.data.append(etree.XML(xml))

    # Take the previous file and generate one POP file.
    xslt_path = os.path.join(RESOURCES_PATH, 'xslt/xml2pop.xsl')
    with open(xslt_path, 'r') as xslt_file:
        xslt_source = xslt_file.read()

    xslt_parsed = etree.parse(BytesIO(xslt_source.encode('utf-8')))
    transform = etree.XSLT(xslt_parsed)

    # Current date and time in the local zone. Take an aware UTC "now"
    # first. The former naive now() was local time merely labelled as UTC,
    # so the conversion shifted it by the UTC offset.
    local_now = datetime.datetime.now(tz.tzutc()).astimezone(tz.tzlocal())
    now = local_now.strftime("%m-%d-%Y %I:%M %p")

    # The value is quoted so XSLT receives a string literal.
    args = {"Date": "\"" + now + "\""}

    # NOTE(review): the document passed to the stylesheet is the last
    # result's content; with empty `results`, `xml` is unbound here.
    dom = etree.XML(xml)

    ns = etree.FunctionNamespace('uri:params')  # register global namespace
    ns['params'] = self.params  # define function in new global namespace

    # Transformation with arguments, used to pass the date to the XSLT.
    newdom = transform(dom, **args)
    transformation.append({
        'title': 'Results.pop',
        'content': str(newdom)
    })

    return transformation
def update(self, env):
    """
    On update, we have to re-build our entire filter list

    Resets the filter cache and the parser with its resolvers. Then, for
    every module path listed under the ``xsltfunctions`` config key, it
    imports the module and registers the functions of its function set
    under that set's namespace URI.
    """
    self.env = env
    self.filters = {}

    self.parser = etree.XMLParser()
    self.parser.resolvers.add(S3FilterResolver())
    self.parser.resolvers.add(PythonFilterResolver())

    self.external_functions = []
    # "in" replaces has_key(), which Python 3 mappings no longer have.
    # NOTE(review): assumes env.config supports the "in" operator like a
    # dict - confirm if it is a custom config class.
    if "xsltfunctions" in self.env.config:
        for func_path in self.env.config['xsltfunctions']:
            __import__(func_path)
            funcset = find_class(func_path)
            ns = etree.FunctionNamespace(funcset.uri)
            for fname in funcset.functions:
                ns[fname] = funcset.functions[fname]
def loadfile(self, sFile):
    """Load a file into xml

    Reads *sFile* as bytes, parses it into ``self.xmldocument`` and
    registers ``self.matches`` as the ``matches`` XPath function in the
    default function namespace.

    :return: the parsed document, or None after reporting the failure
        through ``errHandle.DoError``
    """
    try:
        # Load the file into memory as raw bytes.
        with open(sFile, "rb") as fp:
            sXml = fp.read()
        self.xmldocument = ET.fromstring(sXml)

        # Add the function matches() to the default function namespace.
        ns = ET.FunctionNamespace(None)
        ns['matches'] = self.matches

        return self.xmldocument
    except Exception:
        # "Exception" rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not reported as load errors.
        errHandle.DoError("XmlProcessor/loadfile")
        return None
def loadstring(self, sXml):
    """
    Parse the XML string *sXml* into ``self.xmldocument``.

    Newlines are removed from the text and blank text nodes are dropped by
    the parser. ``self.matches`` is registered as the ``matches`` XPath
    function in the default function namespace.

    :return: True
    """
    # Make sure to parse blanks away!!
    parser = ET.XMLParser(remove_blank_text=True)
    sXml = sXml.replace("\n", "")

    try:
        self.xmldocument = ET.fromstring(sXml, parser)
    except Exception:
        # Retry with UTF-8 bytes. "Exception" rather than a bare except,
        # so KeyboardInterrupt and SystemExit pass through.
        self.xmldocument = ET.fromstring(sXml.encode("utf-8"), parser)

    # Add the function matches() to the default function namespace.
    ns = ET.FunctionNamespace(None)
    ns['matches'] = self.matches

    return True
def register_builtins():
    """
    Expose Python builtins and ``unicode`` methods as XPath functions in
    the ``PYTHON_BUILTINS_NS`` namespace.

    Public, lower-case builtin callables are registered as they are. The
    public ``unicode`` methods are registered afterwards with their
    arguments converted to strings first. A ``within(s, a, b)`` function
    is added as well.
    """
    ns = et.FunctionNamespace(PYTHON_BUILTINS_NS)
    tostring = et.tostring
    str_xpath = et.XPath("string()")

    def make_string(value):
        # Lists collapse to their first item (or u'' when empty); elements
        # become their text content; anything else goes through unicode().
        if isinstance(value, list):
            if not value:
                return u''
            value = value[0]
        if isinstance(value, unicode):
            return value
        if et.iselement(value):
            return tostring(value, method="text", encoding=unicode)
        return unicode(value)

    def wrap_builtin(function):
        # Drops the XPath context argument.
        def wrapped_builtin(_, *args):
            return function(*args)
        return wrapped_builtin

    for (name, builtin) in vars(__builtins__).iteritems():
        if (callable(builtin)
                and not name.startswith('_')
                and name == name.lower()):
            ns[name] = wrap_builtin(builtin)

    def wrap_str_method(function):
        # Drops the XPath context argument and stringifies the rest.
        def wrapped_method(_, *args):
            return function(*tuple(map(make_string, args)))
        return wrapped_method

    for (name, method) in vars(unicode).iteritems():
        if callable(method) and not name.startswith('_'):
            ns[name] = wrap_str_method(method)

    def within(_, s, a, b):
        return make_string(a) <= make_string(s) <= make_string(b)

    ns["within"] = within