def xsltparse(xsltpath, xmlpath, resultpath):
    """Apply an XSLT stylesheet to an XML file and write the result to disk.

    Both input paths are checked with ``os.path.isfile`` first; if either is
    not an existing file an error is logged and the function returns without
    touching ``resultpath``.  Any exception raised while parsing, transforming
    or writing is caught, logged with its traceback, and swallowed.

    Args:
        xsltpath: Path of the XSLT stylesheet.
        xmlpath: Path of the XML document to transform.
        resultpath: Destination the transformed tree is written to.

    Returns:
        None in every case; success and failure are only reported via logging.
    """
    name = "xsltparse"
    log = logging.getLogger(__name__ + "." + name)

    if not os.path.isfile(xsltpath):
        log.error("xsltpath %s is not a valid file!", xsltpath)
        return
    if not os.path.isfile(xmlpath):
        log.error("xmlpath %s is not a valid file!", xmlpath)
        return

    try:
        stylesheet = etree.parse(xsltpath)
        transform = etree.XSLT(stylesheet)
        # https://lxml.de/parsing.html
        # The source document is parsed leniently (recover=True) and without
        # lxml's size limits (huge_tree=True).
        parser = etree.XMLParser(ns_clean=True, huge_tree=True, recover=True)
        source = etree.parse(xmlpath, parser)
        result = transform(source)
        result.write(resultpath)
    except Exception:
        # Best-effort by design: report the failure and let the caller go on.
        log.exception("Failed to parse xml using xslt!")
        return

    # The original message was missing the space between "to" and the path.
    log.debug("Parsed xml written to %s", resultpath)
def get_deps_html(url, dest):
    """Queue downloads for the assets referenced by a saved HTML page.

    The file at ``dest`` is parsed, and every ``src``/``href``/``data``
    reference found on img, input, link, object and script elements is
    resolved against ``url``.  References that point at a host other than
    access.redhat.com are ignored; the rest are put on the module-level
    queue ``q`` as ``(download, absolute_url, local_path)`` and handed to
    ``update_url_html`` together with the directory of ``dest``.  Finally
    the document is written back to ``dest``.

    Args:
        url: URL the page was fetched from, used as the base for resolving
            relative references.
        dest: Path of the saved HTML file; it is read and then overwritten.
    """
    document = lxml.html.parse(dest)
    references = document.xpath(
        "//img/@src | //input/@src | //link/@href | //object/@data | //script/@src"
    )

    for reference in references:
        absolute = urllib.parse.urljoin(url, reference)
        parts = urllib.parse.urlparse(absolute)

        # Only host-less references or ones on access.redhat.com are fetched.
        if not parts.netloc or parts.netloc == "access.redhat.com":
            # The leading "/" is dropped so the path is relative locally.
            q.put(
                (download, absolute, os.path.normpath(parts.path[1:]))
            )
            # presumably rewrites the reference inside the document so it
            # points at the local copy -- confirm against update_url_html
            update_url_html(reference, os.path.split(dest)[0])

    document.write(dest)
def post2markdown( tree ):
    """Strip a blog-post page down to its <article> and save it as HTML.

    The breadcrumb paragraph and the article footer (if any) are removed,
    iframes and images are handed to ``post_videos`` / ``post_imgs``, the
    article is passed through ``post_clean_html``, and the serialized result
    is written to ``tmp_article.html`` in the current directory.

    Args:
        tree: Parsed lxml document of the blog-post page.

    Returns:
        A ``(date, author, title)`` tuple: the ``datetime`` attribute of the
        first <time> element, the text of the first ``a[rel="author"]`` and
        the text of the first ``h1.entry-title.single-title``.

    Raises:
        IndexError: If the article, header, breadcrumb, time, author or
            title element is missing from the document.
    """
    # Process html; keep only the <article> content - where the blog post
    # actually is.
    article = tree.xpath('//article')[0]
    header = tree.xpath('//header[@class="article-header"]')[0]

    breadcrumb = article.xpath('.//p[@id="breadcrumb"]')[0]  # contains: "Blog:"
    header.remove(breadcrumb)

    # Evaluate the footer query once instead of twice.
    footers = article.xpath('.//footer')
    if footers:
        article.remove(footers[0])

    # videos: replace video's iframe with <a><img>
    # NOTE: '//iframe' is an absolute path, so it matches iframes in the
    # whole document, not only inside <article>.
    post_videos(article.xpath('//iframe'))
    post_imgs(article.xpath('.//img'))
    post_clean_html(article)

    # get info (absolute paths: searched document-wide)
    date = article.xpath('//time')[0].attrib['datetime']
    author = article.xpath('//a[@rel="author"]')[0].text
    title = article.xpath('//h1[@class="entry-title single-title"]')[0].text

    # save modified html
    html_article = lxml.html.tostring(article,
                                      pretty_print=True,
                                      include_meta_content_type=True,
                                      encoding='utf-8',
                                      method='html',
                                      with_tail=False)
    # tostring() with encoding='utf-8' yields bytes, so the file must be
    # opened in binary mode; the context manager closes it even on error.
    with open('tmp_article.html', 'wb') as html:
        html.write(html_article)

    return (date, author, title)
event.add( 'SUMMARY', event_info.xpath("span[contains(@class, 'discipline')]")[0].text) kind = event_info.xpath("span[contains(@class, 'kindOfWork')]")[0].text lecturer = event_info.xpath( "span[contains(@class, 'lecturer')]")[0].text group_info = event_info.xpath("span[contains(@class, 'group')]") group = group_info[0].text if group_info else '' event.add('DESCRIPTION', '\n'.join([kind, lecturer, group])) event.add('COMMENT', status) cal.add_component(event) with open(f'{group_id}.ics', 'w+b') as ics: ics.write(cal.to_ical()) with open('index.html', 'w', encoding='utf-8') as html: html.write(f''' <!DOCTYPE html> <head> <meta charset="utf-8"> </head> <body> <ul> <li> <a href="{group_id}.ics">{group_name}</a> ({status}) </li> </ul> </body> ''')
last_script = script_list[-1] add_link_script = """ var progs = data["programs"]; for (var i = 0; i < progs.length; i++) { var prog = progs[i]; if (prog["outputs"].length > 0) { var outputs = prog["outputs"]; for (var j = 0; j < outputs.length; j++) { outputs[j]["file"] = "%s\/" + outputs[j]["file"] } } } """%(os.path.relpath(meme_path,os.path.dirname(reSt_html_path))) script_el = lxml.html.builder.SCRIPT(add_link_script) last_script.addnext(script_el) html.write("meme-chip_index.html",method="html") doc.add(ReStRaw(format="html",options={'file':"meme-chip_index.html"})) #end if (os.path.exists(meme_index_path)): doc.write() doc.close() # 6. convert reSt to PDF and HTML # rst2html_call = 'rst2html.py --stylesheet-path=/nfs/antdata/web_stage/css/lsr.css ' \ # '%s %s'%(reSt_path,reSt_html_path) rst2html_call = 'rst2html.py --stylesheet-path=%s ' \ '%s %s'%(css_path,reSt_path,reSt_html_path) sys.stderr.write(rst2html_call+'\n') r = call(rst2html_call,shell=True)