def on_response(self, resp, req):
    """Rewrite links in proxied HTML responses and inject a <base> element.

    Non-HTML responses (per Content-Type) pass through untouched.  For HTML
    pages, every link is rewritten via ``self.rewrite_link``, a
    ``<base href=...>`` pointing at ``self.absolute_path`` is added when the
    page has none, and the response body and Content-Length header are
    replaced with the rewritten markup.
    """
    ctype = resp.headers.iget('content-type')
    if not ctype:
        return
    ctype = ctype.split(";", 1)[0]
    # if this is an html page, parse it
    if ctype in HTML_CTYPES:
        body = resp.body_string()
        html = lxml.html.fromstring(body)
        # rewrite links to absolute
        html.rewrite_links(self.rewrite_link)
        # add base
        old_base = html.find(".//base")
        base = etree.Element("base")
        base.attrib['href'] = self.absolute_path
        # BUG FIX: lxml elements with no children are falsy, so the previous
        # `if not old_base:` also fired when a <base> already existed,
        # appending a duplicate.  Test for absence with `is None`.
        if old_base is None:
            head = html.find(".//head")
            # Guard against documents without a <head> (previously a crash).
            if head is not None:
                head.append(base)
        # modify response: body changed, so Content-Length must be recomputed
        rewritten_body = lxml.html.tostring(html)
        try:
            resp.headers.ipop('content-length')
        except KeyError:
            pass
        resp.headers['Content-Length'] = str(len(rewritten_body))
        resp._body = StringIO(rewritten_body)
        resp._already_read = False
def _post_process_html(self, content):
    """Sanitize an HTML mail part and return it as text.

    Depending on ``self.links``, links are either mapped through
    ``self._map_cid`` (and forced to open in a new tab) or stripped
    entirely.  The markup is then cleaned with lxml's ``Cleaner`` and
    returned via ``smart_text``.
    """
    html = lxml.html.fromstring(content)
    if self.links:
        html.rewrite_links(self._map_cid)
        for link in html.iterlinks():
            link[0].set("target", "_blank")
    else:
        html.rewrite_links(lambda x: None)
    # Allow class/style on top of lxml's default safe-attribute whitelist.
    safe_attrs = list(defs.safe_attrs) + ["class", "style"]
    cleaner = Cleaner(
        scripts=True, javascript=True, links=True, page_structure=True,
        embedded=True, frames=True, add_nofollow=True,
        safe_attrs=safe_attrs
    )
    mail_text = lxml.html.tostring(
        cleaner.clean_html(html), encoding="unicode")
    # FIX: removed leftover debug dump to /tmp/output.txt — it wrote the
    # user's mail content to a predictable, world-readable path on every
    # call (information leak + debug artifact).
    return smart_text(mail_text)
def show_proxy_response(_url, status_code, headers, output, start_response):
    """Relay a proxied HTTP response, rewriting links in HTML payloads."""
    import lxml.html

    is_html = "content-type" in headers and u'text/html' in headers.get(
        "content-type")
    if is_html:
        tree = lxml.html.fromstring(output)
        tree.rewrite_links(RewriteLink(get_base_url(_url)))
        output = lxml.html.tostring(tree)
    return show_response(status_code, headers, output, start_response)
def html_to_lxml(url, text, clean=False):
    """Parse plain-text HTML into an `lxml` tree."""
    if clean:
        text = _text_from_sp(('pandoc', '--from=html', '--to=html5'),
                             text.encode())
    tree = lxml.html.document_fromstring(text)

    def drop_self_links(link):
        # Endless loops ahoy: blank out links that resolve back to this page.
        return '' if urldefrag(link).url == url else link

    tree.rewrite_links(drop_self_links, base_href=url)
    return tree
def rewrite_resource_paths(content, base=None):
    """Given the content and a new base reference, rewrite the source
    paths in the content.
    """
    tree = lxml.html.fromstring(content)

    def rebase(link):
        # Fragments and protocol-relative URLs are left untouched, as is
        # everything when no base was supplied.
        if link.startswith(('#', '//')) or base is None:
            return link
        return "{0}/{1}".format(base.rstrip('/'), link)

    tree.rewrite_links(rebase, resolve_base_href=False)
    return lxml.html.tostring(tree)
def get_translate_page(tran_page_url, user):
    """Fetch the page at *tran_page_url* and build its translated version.

    Downloads the raw HTML, rewrites its links, records the request in the
    per-user history, adjusts script data-main URLs, transforms every
    element for translation, injects CSS/JS, and returns the serialized
    result.  All heavy lifting is delegated to module-level helpers.
    """
    htmlStr = get_html_str(tran_page_url)
    html = lxml.html.fromstring(htmlStr)
    # Rewrite links, resolving relative ones against the original page URL.
    html.rewrite_links(change_url, base_href=tran_page_url)
    add_to_request_history(html, tran_page_url, user)
    change_script_data_main_url(html, tran_page_url)
    # Transform all elements (presumably the translation step — helper
    # defined elsewhere; confirm semantics against its definition).
    change_all_element(html, user, tran_page_url)
    # Inject CSS and JS assets.
    add_css_js(html)
    return lxml.html.tostring(html)
def execute(self):
    """Forward the upstream response, optionally rewriting HTML links.

    When ``rewrite_headers()`` flags the response for rewriting, the body
    is parsed, links are rewritten via ``self.rewrite_link``, a
    ``<base>`` element is inserted when the page has none, and the
    rewritten body is streamed back with a corrected Content-Length.
    Otherwise the body is passed through unchanged.
    """
    rewrite, headers = self.rewrite_headers()
    if not headers:
        msg = "HTTP/1.1 502 Gateway Error\r\n\r\n bad request."
        self.resp.send(msg)
        return
    if rewrite:
        body = self.parser.body_string()
        if not body:
            rewritten_body = ''
        else:
            html = lxml.html.fromstring(body)
            # rewrite links to absolute
            html.rewrite_links(self.rewrite_link)
            # add base
            absolute_path = "%s%s" % (self.local_base,
                                      self.extra.get('path', ''))
            old_base = html.find(".//base")
            base = etree.Element("base")
            base.attrib['href'] = absolute_path
            # BUG FIX: lxml elements with no children are falsy, so the
            # previous `if not old_base:` also fired when a <base> already
            # existed, appending a duplicate.  Test for absence explicitly.
            if old_base is None:
                head = html.find(".//head")
                # Guard against documents without a <head>.
                if head is not None:
                    head.append(base)
            # modify response
            rewritten_body = bytes(lxml.html.tostring(html))
        # finally send response.
        headers.extend([
            'Content-Length: %s\r\n' % len(rewritten_body),
            "\r\n"])
        self.resp.writeall(bytes("".join(headers)))
        stream = io.BytesIO(rewritten_body)
        while True:
            data = stream.read(io.DEFAULT_BUFFER_SIZE)
            if not data:
                break
            self.resp.writeall(data)
    else:
        self.resp.writeall(bytes("".join(headers) + "\r\n"))
        body = self.parser.body_file()
        send_body(self.resp, body, self.parser.is_chunked())
def rewrite_language_links(html, language_code):
    """Prefix matching links with the language code and mark result safe."""
    if not language_code:
        return mark_safe(html)
    prefix = u'/' + language_code + u'/'
    rewritten = rewrite_links(
        html, lambda lnk: LANGUAGE_LINK_RE.sub(prefix, lnk))
    return mark_safe(rewritten)
def rewrite_language_links(html, language_code):
    """Rewrite links so they carry the given language prefix."""
    if language_code:
        def add_language_prefix(lnk):
            return LANGUAGE_LINK_RE.sub(u'/' + language_code + u'/', lnk)

        html = rewrite_links(html, add_language_prefix)
    return mark_safe(html)
def custom_template_preview_render():
    """Render a sample of the email template with all links neutralized."""
    args = request.args
    body, _ = EmailTemplate.make_sample(
        from_name=args.get('from_name'),
        subject=args.get('subject'),
        style=args.get('style'),
        body=args.get('body'),
    )
    # Point every link at a harmless fragment so previews are inert.
    return rewrite_links(body, lambda link: "#" + link)
def viewmail_html(self, content, **kwargs):
    """Return a displayable version of an HTML mail part.

    Links are either mapped through ``self.map_cid`` or stripped, and the
    surrounding ``<body>`` tag is converted to a ``<div>`` so the result
    can be embedded in another page.
    """
    import lxml.html
    if content is None or content == "":
        return ""
    html = lxml.html.fromstring(content)
    if kwargs.get("links", 0):
        html.rewrite_links(self.map_cid)
    else:
        html.rewrite_links(lambda x: None)
    body_node = html.find("body")
    serialized = lxml.html.tostring(
        html if body_node is None else body_node)
    # Turn <body>/</body> into <div>/</div> for safe embedding.
    return re.sub("<(/?)body", lambda m: "<%sdiv" % m.group(1), serialized)
def custom_template_preview_render():
    """Render a preview of a posted email template with inert links."""
    posted = request.form
    body, _ = EmailTemplate.make_preview(
        from_name=posted.get("from_name"),
        subject=posted.get("subject"),
        style=posted.get("style"),
        body=posted.get("body"),
    )
    # Replace every link target with a fragment so nothing is followable.
    return rewrite_links(body, lambda link: "#" + link)
def _post_process_html(self):
    """Sanitize the message's HTML part in place.

    Links are mapped through ``self._map_cid`` (and opened in a new tab)
    when ``self.links`` is set, otherwise removed; the markup is then
    cleaned and stored back into ``self.contents["html"]``.
    """
    html = lxml.html.fromstring(self.contents["html"])
    if not self.links:
        html.rewrite_links(lambda x: None)
    else:
        html.rewrite_links(self._map_cid)
        for anchor in html.iterlinks():
            anchor[0].set("target", "_blank")
    cleaner = Cleaner(scripts=True, javascript=True, links=True,
                      page_structure=True, embedded=True, frames=True,
                      add_nofollow=True)
    sanitized = lxml.html.tostring(cleaner.clean_html(html))
    self.contents["html"] = smart_text(sanitized)
def adapt_content(content, server_name):
    """Post-process a scraped bizrate.co.uk page for local serving.

    Injects a related-search block into each product-details element and
    strips the 'http://www.bizrate.co.uk' host prefix from links so they
    resolve against this server.  Returns UTF-8 serialized HTML.
    NOTE(review): *server_name* is accepted but never used — confirm
    whether it should replace the hard-coded host below.
    """
    decoded = unicode(content, 'UTF-8')
    html = lxml.html.fromstring(decoded)
    # Free the large raw/decoded strings early; only the tree is needed.
    del content
    del decoded
    catID = extract_catID(html)
    titles = extract_titles(html)
    if catID and titles:
        rel_searches = perform_rel_search(catID, titles)
        product_details = html.xpath('//li[@class = "rev"]/div[@class = "product_details_content"]/..')
        for pos, element in enumerate(product_details):
            rel_search_phrases = rel_search_path(rel_searches, id=pos)
            if rel_search_phrases:
                # Build a highlighted div of related-search links.
                rel_search_div = lxml.html.Element('div')
                rel_search_div.attrib['style'] = 'margin-right: 50px; background-color: rgb(100, 230, 50); font-size: 16px; z-index: 100'
                for phrase in rel_search_phrases:
                    search_link = lxml.html.Element('a')
                    href = '/classify?search_box=1&cat_id=%s&sfsk=0&keyword=%s' %(catID, quote_plus(phrase))
                    search_link.attrib['href'] = href
                    search_link.attrib['style'] = 'z-index: 100'
                    search_link.text = phrase
                    rel_search_div.append(search_link)
                    # Plain-space spacer between consecutive links.
                    spacer = lxml.html.Element('span')
                    spacer.text = ' '
                    rel_search_div.append(spacer)
                # We could find the element properly, but I know its at 5 and this is a hack :S
                element.insert(5, rel_search_div)

    def rewrite_link(link):
        # Make absolute bizrate links relative so they hit this proxy.
        if link.startswith('http://www.bizrate.co.uk'):
            return link[len('http://www.bizrate.co.uk'):]
        else:
            return link

    html.rewrite_links(rewrite_link)
    return lxml.html.tostring(html, encoding="UTF-8")
def _post_process_html(self, content):
    """Return a sanitized copy of *content*.

    Depending on ``self.links``, links are mapped through
    ``self._map_cid`` and forced into new tabs, or removed entirely;
    the cleaned markup is returned through ``smart_text``.
    """
    html = lxml.html.fromstring(content)
    if not self.links:
        html.rewrite_links(lambda x: None)
    else:
        html.rewrite_links(self._map_cid)
        for anchor in html.iterlinks():
            anchor[0].set("target", "_blank")
    cleaner = Cleaner(
        scripts=True, javascript=True, links=True, page_structure=True,
        embedded=True, frames=True, add_nofollow=True)
    return smart_text(lxml.html.tostring(cleaner.clean_html(html)))
def get_html_base(self):
    """
    Gets the HTML associated with the current child task
    Input: None
    Output: Child task HTML
    """
    self.update_task_states()
    html = self.current_task.get_html(self.system)
    try:
        # rewrite_content_links has been known to raise from inside
        # x_module's link handling; fall back to the unmodified HTML so a
        # rewrite failure never breaks rendering.
        return rewrite_links(html, self.rewrite_content_links)
    except Exception:
        return html
def apply_markup_filter(text):
    """Applies a text-to-HTML conversion function to a piece of text and
    returns the generated HTML.

    The function to use is derived from the value of the setting
    ``MARKUP_FILTER``, which should be a 2-tuple:

    * The first element should be the name of a markup filter -- e.g.,
      "markdown" -- to apply. If no markup filter is desired, set this
      to None.
    * The second element should be a dictionary of keyword arguments
      which will be passed to the markup function. If no extra arguments
      are desired, set this to an empty dictionary; some arguments may
      still be inferred as needed, however.

    So, for example, to use Markdown with safe mode turned on (safe mode
    removes raw HTML), put this in your settings file::

        MARKUP_FILTER = ('markdown', { 'safe_mode': 'escape' })

    Currently supports Textile, Markdown and reStructuredText, using
    names identical to the template filters found in
    ``django.contrib.markup``.

    Borrowed from http://djangosnippets.org/snippets/104/
    """
    markup_filter_name, markup_kwargs = get_markup_filter()
    # Whitespace-only input passes through untouched.
    if not text.strip():
        return text
    html = text
    if markup_filter_name is not None:
        if markup_filter_name == 'textile':
            # Imported lazily so the dependency is only needed when used.
            import textile
            if 'encoding' not in markup_kwargs:
                markup_kwargs.update(encoding=settings.DEFAULT_CHARSET)
            if 'output' not in markup_kwargs:
                markup_kwargs.update(output=settings.DEFAULT_CHARSET)
            html = textile.textile(text, **markup_kwargs)
        elif markup_filter_name == 'markdown':
            import markdown
            html = markdown.markdown(text, **markup_kwargs)
        elif markup_filter_name == 'restructuredtext':
            from docutils import core
            if 'settings_overrides' not in markup_kwargs:
                markup_kwargs.update(
                    settings_overrides=getattr(
                        settings,
                        "RESTRUCTUREDTEXT_FILTER_SETTINGS",
                        {},
                    )
                )
            if 'writer_name' not in markup_kwargs:
                markup_kwargs.update(writer_name='html4css1')
            parts = core.publish_parts(source=text, **markup_kwargs)
            html = parts['html_body']
    # Rewrite links in the rendered HTML regardless of which filter ran.
    return rewrite_links(html, rewrite_internal_link)
def all_req(environ, start_response):
    """WSGI entry point that proxies an arbitrary URL given in PATH_INFO.

    The target URL is taken from the request path (scheme defaulted to
    http), headers are filtered/forwarded, the upstream response is
    fetched via ``get_http_result``, and HTML responses get their links
    rewritten to keep navigation inside the proxy.  (Python 2 code:
    ``unicode``/``iteritems``.)
    """
    path_url = environ['PATH_INFO']
    assert path_url.startswith("/")
    path_url = path_url[1:]
    method = environ.get('REQUEST_METHOD').upper()
    if not (path_url.startswith(u"http://")
            or path_url.startswith(u"https://")):
        path_url = u"http://" + unicode(path_url)
    # Favicon requests carry no host of their own: replay them against the
    # base URL of the last proxied request.
    if path_url != u'http://favicon.ico':
        setattr(all_req, LAST_REQ_BASE_URL, get_base_url(path_url))
    else:
        path_url = getattr(all_req, LAST_REQ_BASE_URL, "") + "/favicon.ico"
    req_query_string = environ.get("QUERY_STRING", "")
    try:
        # Read the request body (may legitimately be absent/empty).
        req_data = environ['wsgi.input'].read(
            int(environ.get('CONTENT_LENGTH', '0')))
    except:
        req_data = None
    requestpool_headers = {}
    req_headers = {}
    for key, val in environ.iteritems():
        if key.startswith('HTTP_'):
            # Rebuild the original header name from the CGI-style key.
            header_name = key[5:].replace('_', '-')
            if header_name == 'host'.upper():
                continue
            # Disable caching headers so every request hits upstream.
            if "CACHE-CONTROL" in header_name:
                continue
            elif "IF-MODIFIED-SINCE" in header_name:
                continue
            # Disable connection reuse headers.
            if "CONNECTION" in header_name:
                continue
            # NOTE(review): duplicate CACHE-CONTROL check — the first one
            # above already skipped it; this branch is dead.
            if "CACHE-CONTROL" in header_name:
                continue
            if 'REQUESTSPOOL.' in header_name:
                requestpool_headers[header_name] = val
            else:
                req_headers[header_name] = val
    status_code, headers, output = get_http_result(
        url=path_url, method=method, req_query_string=req_query_string,
        req_data=req_data, req_headers=req_headers)
    # Rewrite links inside HTML responses so they stay on the proxy.
    if "content-type" in headers and u'text/html' in headers.get(
            "content-type"):
        html = lxml.html.fromstring(output)
        html.rewrite_links(RewriteLink(get_base_url(path_url)))
        output = lxml.html.tostring(html)
    start_response(
        "{0} {1}".format(status_code, responses.get(status_code, 'OK')),
        headers.items())
    return (output, )
def get_load_page(self, request_info, url, **kwargs):
    """
    Proxy a web-page through so that a UI can be displayed for showing
    potential results.
    """
    web_client = None
    try:
        # --------------------------------------
        # 1: Make sure that user has permission to make inputs. We don't want to allow people
        #    to use this as a general proxy.
        # --------------------------------------
        # NOTE(review): this only denies users who LACK edit_modinput_web_input
        # but HAVE admin_all_objects; an `or`/`not` looks inverted — confirm
        # the intended capability logic.
        if not WebInputOperationsHandler.hasCapability(
                'edit_modinput_web_input'
        ) and WebInputOperationsHandler.hasCapability('admin_all_objects'):
            return self.render_error_html(
                'You need the "edit_modinput_web_input" capability ' +
                'to make website inputs', 403)

        # Don't allow proxying of the javascript files
        if url.endswith(".js"):
            return {
                'payload': '',
                'status': 200,
                'headers': {
                    'Content-Type': 'application/javascript'
                },
            }

        # --------------------------------------
        # 2: Only allow HTTPS if the install is on Splunk Cloud
        # --------------------------------------
        if ModularInput.is_on_cloud(request_info.session_key
                                    ) and not url.startswith("https://"):
            return self.render_error_html(
                'URLs on Splunk Cloud must use HTTPS protocol',
                401)  # TODO: deterine best code

        # --------------------------------------
        # 3: Perform a request for the page
        # --------------------------------------
        # Get the proxy configuration
        conf_stanza = "default"

        try:
            web_input = WebInput(timeout=10)

            proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
                web_input.get_proxy_config(request_info.session_key, conf_stanza)

        except ResourceNotFound:
            return self.render_error_html(
                "Proxy server information could not be obtained", 202)

        # Get the timeout to use
        timeout = None

        if 'timeout' in kwargs:
            try:
                timeout = int(kwargs['timeout'])
            except ValueError:
                # Fall back to the default when the value isn't an int.
                timeout = 15
        else:
            timeout = 15

        # Get the user-agent
        user_agent = kwargs.get('user_agent', None)

        # Get the information on the browser to use
        browser = None

        if 'browser' in kwargs:
            browser = kwargs['browser']

        # Make the client
        if browser is None or browser == WebScraper.INTEGRATED_CLIENT:
            web_client = DefaultWebClient(timeout, user_agent, logger)
        elif browser == WebScraper.FIREFOX:
            web_client = FirefoxClient(timeout, user_agent, logger)
        elif browser == WebScraper.CHROME:
            web_client = ChromeClient(timeout, user_agent, logger)

        web_client.setProxy(proxy_type, proxy_server, proxy_port,
                            proxy_user, proxy_password)

        # Get the username and password
        username = kwargs.get('username', None)
        password = kwargs.get('password', None)

        username_field = kwargs.get('username_field', None)
        password_field = kwargs.get('password_field', None)
        authentication_url = kwargs.get('authentication_url', None)

        if username is not None and password is not None:
            username = kwargs['username']
            password = kwargs['password']

            username_field = kwargs.get('username_field', None)
            password_field = kwargs.get('password_field', None)
            authentication_url = kwargs.get('authentication_url', None)

            web_client.setCredentials(username, password)

            if authentication_url is not None:
                logger.debug(
                    "Authenticating using form login in scrape_page")
                web_client.doFormLogin(authentication_url, username_field,
                                       password_field)

        # Get the page
        try:
            content = web_client.get_url(url, 'GET')
            response = web_client.get_response_headers()
        except:
            logger.exception(
                "Exception generated while attempting to content for url=%s",
                url)
            return self.render_error_html(
                "Page preview could not be obtained using a web-browser",
                500)

        # --------------------------------------
        # 4: Render the content with the browser if necessary
        # --------------------------------------
        """
        if 'text/html' in response['content-type']:

            # Get the information on the browser to use
            browser = None

            if 'browser' in kwargs:
                browser = kwargs['browser']

            # Try rendering the content using a web-browser
            try:
                if browser is not None and browser != WebScraper.INTEGRATED_CLIENT:
                    web_scraper = WebScraper(timeout=timeout)
                    web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
                    web_scraper.set_authentication(username, password)
                    content = web_scraper.get_result_browser(urlparse(url), browser)

            except:
                logger.exception("Exception generated while attempting to get browser rendering or url=%s", url)

                cherrypy.response.status = 500
                return self.render_error_html("Page preview could not be obtained using a web-browser")
        """

        # --------------------------------------
        # 5: Rewrite the links in HTML files so that they also point to the internal proxy
        # --------------------------------------
        if "<html" in content:

            # Parse the content
            html = lxml.html.document_fromstring(content)

            # Rewrite the links to point to this internal proxy
            rewrite_using_internal_proxy = True

            if rewrite_using_internal_proxy:

                def relocate_href(link):
                    """
                    Change the hrefs such that they go through the proxy.
                    """
                    link = urljoin(url, link)
                    if link.endswith(".js"):
                        return ""
                    if not link.endswith(".css"):
                        return "load_page?url=" + link
                    else:
                        return link

                html.rewrite_links(relocate_href)

                # Block the href links
                for element, attribute, _, _ in html.iterlinks():
                    if element.tag == "a" and attribute == "href":
                        element.set('href', "#")

                    elif element.tag == "form" and attribute == "action":
                        element.set('action', "?")
            else:
                html.make_links_absolute(url)

            # Determine if we should clean the JS
            clean_script = True

            if 'clean_script' in kwargs:
                clean_script = util.normalizeBoolean(
                    kwargs['clean_script'])

            # Determine if we should clean the CSS
            clean_styles = False

            if 'clean_styles' in kwargs:
                clean_styles = util.normalizeBoolean(
                    kwargs['clean_styles'])

            # Clean up the HTML
            if clean_styles or clean_script:

                kill_tags = []

                if clean_script:
                    kill_tags = ["script"]

                # Remove the script blocks
                cleaner = Cleaner(page_structure=False, kill_tags=kill_tags,
                                  javascript=False, links=False,
                                  style=clean_styles, safe_attrs_only=False)

                # Get the content
                content = lxml.html.tostring(cleaner.clean_html(html),
                                             encoding="unicode")

            else:
                content = lxml.html.tostring(html, encoding="unicode")

        # --------------------------------------
        # 6: Respond with the results
        # --------------------------------------
        headers = {}

        if 'content-type' in response:
            headers['Content-Type'] = response['content-type']
        else:
            headers['Content-Type'] = 'text/html'

        # --------------------------------------
        # 7: Clear Javascript files
        # --------------------------------------
        if response.get('content-type', "") == "application/javascript" \
            or response.get('content-type', "") == "application/x-javascript" \
            or response.get('content-type', "") == "text/javascript" \
            or url.endswith(".js"):
            return {'payload': '', 'headers': headers, 'status': 200}

        return {'payload': content, 'headers': headers, 'status': 200}

    except LoginFormNotFound:
        logger.debug("Login form not found")
        return self.render_error_html("Login form was not found", 200)

    except FormAuthenticationFailed as e:
        logger.debug("Form authentication failed: " + str(e))
        return self.render_error_html(
            "Form authentication failed: " + str(e), 200)

    except:
        logger.exception("Error when attempting to proxy an HTTP request")
        return self.render_error_html("Page preview could not be created",
                                      500)

    finally:
        if web_client:
            web_client.close()
def apply_markup_filter(text):
    """Applies a text-to-HTML conversion function to a piece of text and
    returns the generated HTML.

    The function to use is derived from the value of the setting
    ``MARKUP_FILTER``, which should be a 2-tuple:

    * The first element should be the name of a markup filter -- e.g.,
      "markdown" -- to apply. If no markup filter is desired, set this
      to None.
    * The second element should be a dictionary of keyword arguments
      which will be passed to the markup function. If no extra arguments
      are desired, set this to an empty dictionary; some arguments may
      still be inferred as needed, however.

    So, for example, to use Markdown with safe mode turned on (safe mode
    removes raw HTML), put this in your settings file::

        MARKUP_FILTER = ('markdown', { 'safe_mode': 'escape' })

    Currently supports Textile, Markdown and reStructuredText, using
    names identical to the template filters found in
    ``django.contrib.markup``.

    Borrowed from http://djangosnippets.org/snippets/104/
    """
    markup_filter_name, markup_kwargs = get_markup_filter()
    # Whitespace-only input is returned unmodified.
    if not text.strip():
        return text
    html = text
    if markup_filter_name is not None:
        if markup_filter_name == 'textile':
            # Lazy import: only required when the textile filter is chosen.
            import textile
            if 'encoding' not in markup_kwargs:
                markup_kwargs.update(encoding=settings.DEFAULT_CHARSET)
            if 'output' not in markup_kwargs:
                markup_kwargs.update(output=settings.DEFAULT_CHARSET)
            html = textile.textile(text, **markup_kwargs)
        elif markup_filter_name == 'markdown':
            import markdown
            html = markdown.markdown(text, **markup_kwargs)
        elif markup_filter_name == 'restructuredtext':
            from docutils import core
            if 'settings_overrides' not in markup_kwargs:
                markup_kwargs.update(settings_overrides=getattr(
                    settings,
                    "RESTRUCTUREDTEXT_FILTER_SETTINGS",
                    {},
                ))
            if 'writer_name' not in markup_kwargs:
                markup_kwargs.update(writer_name='html4css1')
            parts = core.publish_parts(source=text, **markup_kwargs)
            html = parts['html_body']
    # Rewrite internal links in whatever HTML the chosen filter produced.
    return rewrite_links(html, rewrite_internal_link)
newlink = '%s%s' % (relpath, link) else: newlink = link if args.vverbose: print '(abs2rel) old link: %s' % link print '(abs2rel) new link: %s' % newlink print return newlink if args.verbose: print 'Replacing absolute links with relative links' for root, dirs, files in os.walk(args.path): for file in files: if file.find(args.suffix) != -1: page = open(os.path.join(root, file)).read() if args.verbose: print 'file: %s/%s' % (root, file) html = lxml.html.fromstring(page) html.rewrite_links(abs2rel) # Write the updated links back to the file with open(os.path.join(root, file), 'w') as f: f.write(lxml.html.tostring(html))
def handleAdd(self, action):
    """Convert the selected content objects into 'archive' items.

    Form handler: collects widget values plus per-object metadata
    (title/description/date/text, file and image payloads), creates one
    'archive' object per original inside a "<title> Converted" folder,
    deletes the originals, and redirects to the folder listing.
    (Plone/Zope, Python 2.)
    """
    data, errors = self.extractData()
    if errors:
        self.status = self.formErrorsMessage
        return
    # 1. Find or create folder where selected items will be archived.
    folder_id = "c"
    folder_title = "{} Converted".format(self.context.Title())
    folder = self.context.get(folder_id)
    if folder is None:
        folder = self.context[self.context.invokeFactory(
            'Folder', folder_id, title=folder_title)]
    # 2. Find original objects.
    adapter = IAdapter(self.context)
    paths = self.paths.split(
        '\r\n') if '\r\n' in self.paths else self.paths.split('\n')
    objs = self.context.getObjectsFromPathList(paths)
    form = self.request.form
    # Data from form
    title = form.get('form.widgets.IBasic.title')
    description = form.get('form.widgets.IBasic.description')
    # Optional fallback date assembled from the three date widgets.
    fpaivays = None
    if form.get('form.widgets.paivays-year') and form.get(
            'form.widgets.paivays-day'):
        fpaivays = datetime(int(form.get('form.widgets.paivays-year')),
                            int(form.get('form.widgets.paivays-month')),
                            int(form.get('form.widgets.paivays-day')))
    text = form.get('form.widgets.text')
    # Widget keys handled explicitly above (or irrelevant), excluded from
    # the generic copy loop below.
    omits = [
        'form.widgets.IBasic.title', 'form.widgets.IBasic.description',
        'form.widgets.paivays-year', 'form.widgets.paivays-month',
        'form.widgets.paivays-day', 'form.widgets.text',
        'form.widgets.text.mimeType', 'form.buttons.convert',
        'form.widgets.paivays-empty-marker', 'form.widgets.paivays-calendar',
        'form.widgets.IVersionable.changeNote', 'form.widgets.paths'
    ]
    # file: optional fallback upload applied when an original has no file.
    file_field = self.file_field()
    fname = 'form.widgets.{}'.format(file_field.getName())
    cfile = form.get(fname)
    omits.append(fname)
    if cfile:
        cfile.seek(0)
        file_data = cfile.read()
        file_name = cfile.filename
        cfile.close()
    # image: optional fallback upload applied when an original has no image.
    image_field = self.image_field()
    iname = 'form.widgets.{}'.format(image_field.getName())
    cimage = form.get(iname)
    omits.append(iname)
    if cimage:
        cimage.seek(0)
        image_data = cimage.read()
        image_name = cimage.filename
        cimage.close()
    # Copy all remaining widget values generically, keyed by field name.
    keys = [
        key for key in form.keys() if key not in omits
        and key.startswith('form.widgets.') and not
        key.endswith('empty-marker')
    ]
    data = {}
    for key in keys:
        val = form.get(key)
        if val:
            if isinstance(val, list):
                val = [va.decode('unicode_escape') for va in val]
            data[key.split('.')[2]] = val
    object_ids = []
    # 3. Select values and create archive.
    for obj in objs:
        data = data.copy()
        # Per-object values win over the form-wide fallbacks.
        data['title'] = safe_unicode(obj.Title()) or title
        data['description'] = safe_unicode(
            obj.Description()) or description
        uuid = obj.UID()
        brain = adapter.get_brain(UID=uuid)
        # Date preference: effective (if published) > created > form date.
        paivays = fpaivays
        if brain.review_state == 'published':
            paivays = brain.effective
        if paivays is None:
            paivays = brain.created
        if not isinstance(paivays, datetime):
            # Normalize Zope DateTime to a naive datetime.
            paivays = paivays.asdatetime().replace(tzinfo=None)
        data['paivays'] = paivays
        if obj.getField('text') is not None:
            text = self._strip_dev(
                rewrite_links(safe_unicode(obj.getField('text').get(obj)),
                              link_repl_func)) or text
        if text:
            data['text'] = text
        content = createContentInContainer(folder,
                                           'archive',
                                           checkConstraints=False,
                                           **data)
        # file
        filedata = None
        contentType = ''
        ofile = obj.getField('file', obj)
        if ofile:
            file_obj = ofile.get(obj)
            if file_obj and file_obj.get_size():
                filedata = file_obj.data
                filename = file_obj.filename or data['title']
                contentType = file_obj.getContentType()
        if filedata is None and cfile:
            filedata = file_data
            filename = file_name
        if filedata is not None:
            setattr(
                content, file_field.getName(),
                NamedBlobFile(data=filedata,
                              filename=safe_unicode(filename),
                              contentType=contentType))
        # image
        imagedata = None
        contentType = ''
        oimage = obj.getField('image', obj)
        if oimage:
            image_obj = oimage.get(obj)
            if image_obj and image_obj.get_size():
                # Pdata chunks need their raw bytes unwrapped.
                imagedata = image_obj.data if not isinstance(
                    image_obj.data, Pdata) else image_obj.data.data
                imagename = safe_unicode(
                    image_obj.filename) or data['title']
                contentType = image_obj.getContentType()
        if imagedata is None and cimage:
            imagedata = image_data
            imagename = image_name
        if imagedata is not None:
            setattr(
                content, image_field.getName(),
                NamedBlobImage(data=imagedata,
                               filename=safe_unicode(imagename),
                               contentType=contentType))
        alsoProvides(content, IArchive)
        modified(content)
        object_ids.append(obj.id)
    # Remove the original object
    self.context.manage_delObjects(object_ids)
    message = _(
        u"add_converted_archives_success",
        default=
        u"${number} converted archive(s) are added to folder: ${title}",
        mapping={
            'number': len(objs),
            'title': safe_unicode(folder_title)
        })
    IStatusMessage(self.request).addStatusMessage(message, type='info')
    url = '{}/folder_contents'.format(self.context.absolute_url())
    return self.request.response.redirect(url)
relpath = os.path.relpath(args.path, root) newlink = '%s%s' % (relpath, link) else: newlink = link if args.vverbose: print '(abs2rel) old link: %s' % link print '(abs2rel) new link: %s' % newlink print return newlink if args.verbose: print 'Replacing absolute links with relative links' for root, dirs, files in os.walk(args.path): for file in files: if file.find(args.suffix) != -1: page = open(os.path.join(root, file)).read() if args.verbose: print 'file: %s/%s' % (root, file) html = lxml.html.fromstring(page) html.rewrite_links(abs2rel) # Write the updated links back to the file with open(os.path.join(root, file), 'w') as f: f.write(lxml.html.tostring(html))
def load_page(self, url, **kwargs):
    """
    Proxy a web-page through so that a UI can be displayed for showing
    potential results.
    """
    web_client = None
    try:
        # --------------------------------------
        # 1: Make sure that user has permission to make inputs. We don't want to allow people
        #    to use this as a general proxy.
        # --------------------------------------
        if not WebInputController.hasCapability('edit_modinput_web_input'):
            return self.render_error_html('You need the "edit_modinput_web_input" capability ' +
                                          'to make website inputs')

        # Don't allow proxying of the javascript files
        if url.endswith(".js"):
            cherrypy.response.headers['Content-Type'] = 'application/javascript'
            return ""

        # --------------------------------------
        # 2: Only allow HTTPS if the install is on Splunk Cloud
        # --------------------------------------
        if ModularInput.is_on_cloud(cherrypy.session.get('sessionKey')) and not url.startswith("https://"):
            return self.render_error_html('URLs on Splunk Cloud must use HTTPS protocol')

        # --------------------------------------
        # 3: Perform a request for the page
        # --------------------------------------
        # Get the proxy configuration
        conf_stanza = "default"

        try:
            web_input = WebInput(timeout=10)

            proxy_type, proxy_server, proxy_port, proxy_user, proxy_password = \
                web_input.get_proxy_config(cherrypy.session.get('sessionKey'), conf_stanza)

        except splunk.ResourceNotFound:
            cherrypy.response.status = 202
            return self.render_error_html("Proxy server information could not be obtained")

        # Get the timeout to use
        timeout = None

        if 'timeout' in kwargs:
            try:
                timeout = int(kwargs['timeout'])
            except ValueError:
                # Fall back to the default when the value isn't an int.
                timeout = 15
        else:
            timeout = 15

        # Get the user-agent
        user_agent = kwargs.get('user_agent', None)

        # Get the information on the browser to use
        browser = None

        if 'browser' in kwargs:
            browser = kwargs['browser']

        # Make the client
        if browser is None or browser == WebScraper.INTEGRATED_CLIENT:
            web_client = DefaultWebClient(timeout, user_agent, logger)
        elif browser == WebScraper.FIREFOX:
            web_client = FirefoxClient(timeout, user_agent, logger)
        elif browser == WebScraper.CHROME:
            web_client = ChromeClient(timeout, user_agent, logger)

        web_client.setProxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)

        # Get the username and password
        username = kwargs.get('username', None)
        password = kwargs.get('password', None)

        username_field = kwargs.get('username_field', None)
        password_field = kwargs.get('password_field', None)
        authentication_url = kwargs.get('authentication_url', None)

        if username is not None and password is not None:
            username = kwargs['username']
            password = kwargs['password']

            username_field = kwargs.get('username_field', None)
            password_field = kwargs.get('password_field', None)
            authentication_url = kwargs.get('authentication_url', None)

            web_client.setCredentials(username, password)

            if authentication_url is not None:
                logger.debug("Authenticating using form login in scrape_page")
                web_client.doFormLogin(authentication_url, username_field, password_field)

        # Get the page
        try:
            content = web_client.get_url(url, 'GET')
            response = web_client.get_response_headers()
        except:
            logger.exception("Exception generated while attempting to content for url=%s", url)
            cherrypy.response.status = 500
            return self.render_error_html("Page preview could not be created using a web-browser")

        # --------------------------------------
        # 4: Render the content with the browser if necessary
        # --------------------------------------
        """
        if 'text/html' in response['content-type']:

            # Get the information on the browser to use
            browser = None

            if 'browser' in kwargs:
                browser = kwargs['browser']

            # Try rendering the content using a web-browser
            try:
                if browser is not None and browser != WebScraper.INTEGRATED_CLIENT:
                    web_scraper = WebScraper(timeout=timeout)
                    web_scraper.set_proxy(proxy_type, proxy_server, proxy_port, proxy_user, proxy_password)
                    web_scraper.set_authentication(username, password)
                    content = web_scraper.get_result_browser(urlparse.urlparse(url), browser)

            except:
                logger.exception("Exception generated while attempting to get browser rendering or url=%s", url)

                cherrypy.response.status = 500
                return self.render_error_html("Page preview could not be created using a web-browser")
        """

        # --------------------------------------
        # 5: Rewrite the links in HTML files so that they also point to the internal proxy
        # --------------------------------------
        if "<html" in content:

            # Parse the content
            html = lxml.html.document_fromstring(content)

            # Rewrite the links to point to this internal proxy
            rewrite_using_internal_proxy = True

            if rewrite_using_internal_proxy:

                def relocate_href(link):
                    """
                    Change the hrefs such that they go through the proxy.
                    """
                    link = urlparse.urljoin(url, link)
                    if link.endswith(".js"):
                        return ""
                    if not link.endswith(".css"):
                        return "load_page?url=" + link
                    else:
                        return link

                html.rewrite_links(relocate_href)

                # Block the href links
                for element, attribute, _, _ in html.iterlinks():
                    if element.tag == "a" and attribute == "href":
                        element.set('href', "#")

                    elif element.tag == "form" and attribute == "action":
                        element.set('action', "?")
            else:
                html.make_links_absolute(url)

            # Determine if we should clean the JS
            clean_script = True

            if 'clean_script' in kwargs:
                clean_script = util.normalizeBoolean(kwargs['clean_script'])

            # Determine if we should clean the CSS
            clean_styles = False

            if 'clean_styles' in kwargs:
                clean_styles = util.normalizeBoolean(kwargs['clean_styles'])

            # Clean up the HTML
            if clean_styles or clean_script:

                kill_tags = []

                if clean_script:
                    kill_tags = ["script"]

                # Remove the script blocks
                cleaner = Cleaner(page_structure=False, kill_tags=kill_tags,
                                  javascript=False, links=False,
                                  style=clean_styles, safe_attrs_only=False)

                # Get the content
                content = lxml.html.tostring(cleaner.clean_html(html))

            else:
                content = lxml.html.tostring(html)

        # --------------------------------------
        # 6: Respond with the results
        # --------------------------------------
        if 'content-type' in response:
            cherrypy.response.headers['Content-Type'] = response['content-type']
        else:
            cherrypy.response.headers['Content-Type'] = 'text/html'

        # --------------------------------------
        # 7: Clear Javascript files
        # --------------------------------------
        if response.get('content-type', "") == "application/javascript" \
            or response.get('content-type', "") == "application/x-javascript" \
            or response.get('content-type', "") == "text/javascript" \
            or url.endswith(".js"):
            return ""

        return content

    except LoginFormNotFound:
        logger.debug("Login form not found")
        return self.render_error_html("Login form was not found")

    except FormAuthenticationFailed as e:
        logger.debug("Form authentication failed: " + str(e))
        return self.render_error_html("Form authentication failed: " + str(e))

    except:
        logger.exception("Error when attempting to proxy an HTTP request")
        cherrypy.response.status = 500
        return self.render_error_html("Page preview could not be created")

    finally:
        if web_client:
            web_client.close()
def scrub(site):
    """
    Given root, find content with HTML body, look for bad links or other
    errors.
    """
    searcher = ICatalogSearch(site)
    total, docids, resolver = searcher(interfaces=[IContent], )
    log.info("Found a total of %d documents", total)
    for docid in docids:
        doc = resolver(docid)
        if not hasattr(doc, 'text'):
            continue
        path = model_path(doc)
        log.debug("Checking %s", path)
        text = doc.text
        if not text:
            # Some types we're expecting not to have text, so don't warn
            # about those
            if not (ICommunity.providedBy(doc)
                    or ICalendarEvent.providedBy(doc)):
                log.warn("No text: %s %s", type(doc), path)
            continue
        try:
            try:
                # Will throw ParserError if fragment doesn't have a single
                # root element.
                html = fragment_fromstring(doc.text)
            except ParserError:
                # Wrap in a single div to make the parser happy
                html = fragment_fromstring('<div>%s</div>' % doc.text)
        except XMLSyntaxError:
            log.error("Unparseable: %s", path, exc_info=True)
            # BUG FIX: previously fell through with `html` unbound, raising
            # NameError on the rewrite below.  Skip unparseable documents.
            continue

        # Check and fix links
        def callback(link):
            fixed = _rewrite_link(site, path, link)
            if fixed != link:
                log.info("Link rewritten at %s", path)
                log.info("Old link: %s", link)
                log.info("New link: %s", fixed)
            if not isinstance(fixed, unicode):
                fixed = unicode(fixed, 'utf-8')
            return fixed

        html.rewrite_links(callback)

        # Need to also change any instances of the 'mce_href' attribute to
        # match newly rewritten 'href' attribute.
        for element in html.getiterator():
            if 'mce_href' in element.keys():
                element.set('mce_href', element.get('href'))

        doc.text = unicode(lxml.html.tostring(html, 'utf-8'), 'utf-8')

    log.info("Done.")
    log.info("Unknown schemes: %s", ', '.join(unknown_schemes))