def build_newone(self, table):
    """Wrap the given row elements in a minimal HTML page and open it in a browser."""
    prologue = """<html><body>"""
    epilogue = """</body></html>"""
    # Serialize each element and splice them into one document body.
    body_markup = " ".join(tostring(element) for element in table)
    page = fromstring(prologue + body_markup + epilogue)
    open_in_browser(page)
def render_in_browser(self, **kwargs):
    """Render the graph and display the result in the default web browser.

    Requires the optional ``lxml`` dependency; ``kwargs`` are forwarded
    to :meth:`render_tree`.
    """
    try:
        from lxml.html import open_in_browser
    except ImportError:
        raise ImportError('You must install lxml to use render in browser')
    rendered = self.render_tree(**kwargs)
    open_in_browser(rendered, encoding='utf-8')
def render_in_browser(self):
    """Render the graph and pop it open in the user's default browser.

    Raises ImportError when the optional ``lxml`` package is missing.
    """
    try:
        from lxml.html import open_in_browser
    except ImportError:
        raise ImportError('You must install lxml to use render in browser')
    tree = self.render_tree()
    open_in_browser(tree, encoding='utf-8')
def open_in_browser(response):
    """Provide a simple interface for `lxml.html.open_in_browser` function.

    Be careful, use this function only for debug purpose.

    :param response: object exposing an ``html`` document tree attribute
    :return: None
    """
    document = response.html
    lh.open_in_browser(document)
def render_in_browser(self, **kwargs):
    """Render the graph, open it in your browser with black magic.

    ``force_uri_protocol`` defaults to ``'https'`` unless overridden
    by the caller; all keyword arguments reach :meth:`render_tree`.
    """
    try:
        from lxml.html import open_in_browser
    except ImportError:
        raise ImportError('You must install lxml to use render in browser')
    if 'force_uri_protocol' not in kwargs:
        kwargs['force_uri_protocol'] = 'https'
    open_in_browser(self.render_tree(**kwargs), encoding='utf-8')
def render_in_browser(self, **kwargs):
    """Render the graph tree and hand it to the system browser.

    Defaults ``force_uri_protocol`` to HTTPS when the caller has not
    chosen one; lxml is imported lazily because it is optional.
    """
    try:
        from lxml.html import open_in_browser
    except ImportError:
        raise ImportError("You must install lxml to use render in browser")
    kwargs.setdefault("force_uri_protocol", "https")
    rendered_tree = self.render_tree(**kwargs)
    open_in_browser(rendered_tree, encoding="utf-8")
def open_in_browser(self):
    """Render this table as HTML and open it in the default browser.

    Only supported on Windows (``sys.platform == "win32"``); on other
    platforms a message is printed instead. Failures are reported to
    stdout rather than raised.
    """
    if sys.platform == "win32":
        try:
            td_td_html = self.to_html()
            if not td_td_html:
                # Nothing to display — bail out quietly.
                return
            table = """<table border="1" cellspacing="0">{}</table>""".format(
                td_td_html)
            open_in_browser(etree.fromstring(table))
        # Was a bare `except:` — that also swallowed SystemExit and
        # KeyboardInterrupt; Exception keeps the best-effort behavior
        # without hiding interpreter-level signals.
        except Exception:
            print("open_in_browser ERROR!")
        else:
            print("open in browser success!")
    else:
        print("{}不支持此功能".format(sys.platform))
def getSublists(href, hinweis):
    # Scrape the category page at *href*, absolutize its links, and hand every
    # link found under the active left-nav entries to scrapeData, tagging each
    # with the quoted *hinweis* annotation. (Python 2 code.)
    try:
        html = scraperwiki.scrape(href)
        root = lxml.html.fromstring(html)
        root.make_links_absolute("http://www.spar.at/")
        open_in_browser(root)  # debug aid: view the fetched page locally
        for sublist in root.cssselect("div.left-nav li.on"):
            try:
                for element, attribute, link, pos in sublist.iterlinks():
                    #url = sublist.cssselect("li a")[0].attrib['href']
                    print "Sublist " + link
                    scrapeData(link, '"' + hinweis + '"')
            except IndexError:
                print "no href"
            except ValueError:
                print "no href"
    # NOTE(review): exception nesting reconstructed from collapsed source —
    # confirm the outer handler matches the original layout.
    except ValueError:
        print "no href"
def getSublists(href, hinweis):
    # Fetch *href*, make links absolute against the spar.at host, then pass
    # each link inside the highlighted left-nav sub-lists to scrapeData with
    # *hinweis* as a quoted marker. (Python 2 code.)
    try:
        html = scraperwiki.scrape(href)
        root = lxml.html.fromstring(html)
        root.make_links_absolute("http://www.spar.at/")
        open_in_browser(root)  # debug aid: inspect the downloaded page
        for sublist in root.cssselect("div.left-nav li.on"):
            try:
                for element, attribute, link, pos in sublist.iterlinks():
                    #url = sublist.cssselect("li a")[0].attrib['href']
                    print "Sublist "+link
                    scrapeData(link,'"'+hinweis+'"')
            except IndexError:
                print "no href"
            except ValueError:
                print "no href"
    # NOTE(review): handler nesting inferred from collapsed source — verify.
    except ValueError:
        print "no href"
# Python 2 demo: look up the word "dictionary" on Youdao and inspect the
# Collins result block both on stdout and in a browser window.
import urllib
from lxml import html

url='http://dict.youdao.com/search?q='+'dictionary'
# Download the result page and parse it into an lxml element tree.
page = html.fromstring(urllib.urlopen(url).read())
# First container matching the Collins dictionary section.
collins = page.xpath('//*[@id="collinsResult"]/div/div/div/div')[0]
print "collins result: ", html.tostring(collins)
#print "type of collins", type(collins)
#print "type of page", type(page)
# Open just the extracted fragment in the default browser for visual inspection.
html.open_in_browser(collins)
def open_in_browser(tree, encoding='utf-8'):
    '''Opens a LXML tree in a browser.'''
    # Import lazily so lxml stays an optional dependency; the local name
    # deliberately avoids shadowing this wrapper.
    from lxml.html import open_in_browser as _lxml_open_in_browser
    _lxml_open_in_browser(tree, encoding)
from lxml import html
# Parse the HTML payload of a previously fetched response into an element
# tree. NOTE(review): `events0` is defined earlier in the notebook — confirm.
events_html = html.fromstring(events0.text)
# -
# ### Using xpath to extract content from HTML
# `XPath` is a tool for identifying particular elements within a HTML
# document. The developer tools built into modern web browsers make it
# easy to generate `XPath`s that can be used to identify the elements of a
# web page that we wish to extract.
#
# We can open the html document we retrieved and inspect it using
# our web browser.
# + {"results": "'hide'"}
html.open_in_browser(events_html, encoding = 'UTF-8')
# -
# ![](img/dev_tools_right_click.png)
#
# ![](img/dev_tools_inspect.png)
#
# Once we identify the element containing the information of interest we
# can use our web browser to copy the `XPath` that uniquely identifies
# that element.
#
# ![](img/dev_tools_xpath.png)
#
# Next we can use python to extract the element of interest:
events_list_html = events_html.xpath('//*[@id="events_list"]')[0]
from lxml.html.diff import htmldiff
from lxml.html import parse, tostring, open_in_browser, fromstring


def get_page(url):
    """Fetch *url*, absolutize its links, and return the serialized document."""
    doc = parse(url).getroot()
    doc.make_links_absolute()
    return tostring(doc)


def compare_pages(url1, url2, selector='body div'):
    """Return *url1*'s document with the element matched by *selector*
    replaced by an inline ``htmldiff`` against the matching element of *url2*.

    :param url1: base page; its document is returned (mutated in place)
    :param url2: page to diff against
    :param selector: CSS selector; the first match on each page is compared
    """
    basis = parse(url1).getroot()
    basis.make_links_absolute()
    other = parse(url2).getroot()
    other.make_links_absolute()
    el1 = basis.cssselect(selector)[0]
    el2 = other.cssselect(selector)[0]
    diff_content = htmldiff(tostring(el1), tostring(el2))
    diff_el = fromstring(diff_content)
    # Swap the original element for the rendered diff at the same position.
    parent = el1.getparent()
    parent.insert(parent.index(el1), diff_el)
    parent.remove(el1)
    return basis


if __name__ == '__main__':
    import sys
    # Bug fix: the selector argument used to be read unconditionally from
    # sys.argv[3], raising IndexError when omitted even though
    # compare_pages() has a default. Make it optional on the command line.
    selector = sys.argv[3] if len(sys.argv) > 3 else 'body div'
    doc = compare_pages(sys.argv[1], sys.argv[2], selector)
    open_in_browser(doc)
#!/usr/bin/env python3 # Copyright (c) 2012 Домоглед <*****@*****.**> # @author Петр Болф <*****@*****.**> ''' Hen je program, který ... ''' #v systému import webbrowser webbrowser.open(url) #nebo v lxml, který to výše uvedené používá from lxml.html import open_in_browser open_in_browser(element)
# here we setup the necessary agent to download a google html page opener = urllib2.build_opener() opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64) \ AppleWebKit/537.36 (KHTML, like Gecko) \ Chrome/39.0.2171.95 Safari/537.36 \ OPR/26.0.1656.60')] # let's download google_html = opener.open(URL) # parse the html google_parsed = html.parse(google_html) # Here's a smarter way to see what exactly it is you've downloaded/parsed with lxml: html.open_in_browser(google_parsed) #file://c:/users/rodrigo/appdata/local/temp/tmp1xllau.html # Here comes the 'selecting'! google_results = google_parsed.xpath('//*[@id="rso"]/div[2]') print len(google_results) #1 # the xpath in this line basically selects all children, which in our # case are the 10 'li' elements print len(google_results[0].xpath('./*')) #10 # print out hyperlinks # Note: after using devtool's magnifying glass and 'copy xpath', I got:
def view(response=None):
    """Open *response* in a browser; without one, reuse the last stored response.

    Falls back to the module-level ``data["response"]`` when called with
    no argument.
    """
    if response is None:
        global data
        response = data["response"]
    parsed = HtmlParser(response)
    open_in_browser(parsed, response.encoding)
def render_in_browser(self):
    """Render the graph, open it in your browser with black magic."""
    from lxml.html import open_in_browser
    rendered = self.render_tree()
    open_in_browser(rendered, encoding='utf-8')
# # While JSON parsing is built into the Python `requests` library, parsing HTML requires a separate library. I recommend using the HTML parser from the `lxml` library; others prefer an alternative called `beautifulsoup4`. from lxml import html # convert a html text representation (`events.text`) into # a tree-structure (DOM) html representation (`events_html`) events_html = html.fromstring(events.text) # ### Using XPath to extract content from HTML # # `XPath` is a tool for identifying particular elements within a HTML document. The developer tools built into modern web browsers make it easy to generate `XPath`s that can be used to identify the elements of a web page that we wish to extract. # # We can open the HTML document we retrieved and inspect it using our web browser. html.open_in_browser(events_html, encoding = 'UTF-8') # ![](Python/PythonWebScrape/images/dev_tools_right_click.png) # # ![](Python/PythonWebScrape/images/dev_tools_inspect.png) # # Once we identify the element containing the information of interest we can use our web browser to copy the `XPath` that uniquely identifies that element. # # ![](Python/PythonWebScrape/images/dev_tools_xpath.png) # # Next we can use Python to extract the element of interest: events_list_html = events_html.xpath('//*[@id="events_list"]/article') # Let's just extract the second element in our events list.
def compare_in_differnt_browser(comparedpage):
    """Load the document at the address-bar URL and show it in the browser.

    NOTE(review): the ``comparedpage`` argument is currently unused by
    this implementation — confirm whether that is intentional.
    """
    from lxml.html import parse, open_in_browser
    document = parse(addressbar.get_text()).getroot()
    document.make_links_absolute()
    open_in_browser(document)
def open_in_browser(self, encoding='utf-8'):
    """Display the wrapped document tree in the default web browser."""
    html.open_in_browser(self.root, encoding=encoding)
def open_in_browser(self):
    """Show this object's parsed tree in the system browser (debug aid)."""
    tree = self.tree
    open_in_browser(tree)
def open_in_browser(tree, encoding='utf-8'):
    '''Opens a LXML tree in a browser.'''
    # Lazy import keeps lxml optional; access via the module to avoid
    # shadowing this wrapper's own name.
    import lxml.html
    lxml.html.open_in_browser(tree, encoding)
def show(self):
    """Open this object's source document in the default web browser."""
    source_tree = self.source
    open_in_browser(source_tree)
#!/usr/bin/env python3 # Copyright (c) 2012 Домоглед <*****@*****.**> # @author Петр Болф <*****@*****.**> """ Hen je program, který ... """ # v systému import webbrowser webbrowser.open(url) # nebo v lxml, který to výše uvedené používá from lxml.html import open_in_browser open_in_browser(element)
def get_captcha(captcha_src):
    """Show the captcha image in a browser and return what the user types in."""
    image_fragment = html.fromstring(f"<img src='{captcha_src}' />")
    html.open_in_browser(image_fragment)
    return input("Enter the captcha displayed in browser: ")