def test_to_doc():
    """to_doc() must yield an lxml HtmlElement for every supported parser."""
    # Whole documents: each parser produces an HtmlElement.
    for parser in (scraper.LXML_HTML, scraper.HTML5PARSER, scraper.BEAUTIFULSOUP):
        doc = lx.to_doc(ex.HTML_1, parser)  # LXML_HTML is the default parser
        assert isinstance(doc, lxml.html.HtmlElement)
    # An unrecognized parser yields None.
    doc = lx.to_doc(ex.HTML_1, parser=None)
    assert doc is None
    #
    # Now the same checks with HTML fragments (whole_doc=False).
    for parser in (scraper.LXML_HTML, scraper.HTML5PARSER, scraper.BEAUTIFULSOUP):
        doc = lx.to_doc(ex.FRAGMENT, parser, whole_doc=False)
        assert isinstance(doc, lxml.html.HtmlElement)
    doc = lx.to_doc(ex.FRAGMENT, parser=None, whole_doc=False)
    assert doc is None
def get_image_url_list(url):
    """Controller function for getting the URLs of the JPG images."""
    page_html = get_page(url)
    root = lx.to_doc(page_html)
    subpage_urls = get_subpages(root)
    return extract_images_from_pages(subpage_urls)
def process(word):
    """Process the given word.

    The return value is a tuple: (word, hyphenation, pronunciation mp3).
    """
    page_url = _template.format(word=word)
    page_html = web.get_page(page_url, user_agent=True)
    root = lx.to_doc(page_html)
    return (word, get_hyphen(root), get_mp3(root))
def extract_images_from_pages(pages):
    """Extract images from subpages."""
    candidates = (get_jpg_image(lx.to_doc(get_page(page))) for page in pages)
    return [img for img in candidates if img]  # drop None elems
def demo6():
    # Deliberately sloppy HTML: the second <li> is never closed.
    text = """<ul>
<li>abc</li>
<li>def
<li>ghi</li>
</ul>"""
    doc = lx.to_doc(text)
    for item in doc.cssselect("ul li"):
        print(item.text.strip())
def process(url):
    html = get_page(url, user_agent=True)
    doc = lx.to_doc(html)
    #lx.show_paths(doc, find='Montreal, Quebec')
    city = doc.cssselect('h1#locationName.brTopLeft5')[0].text
    print(city)
    celsius = doc.cssselect('div#tempActual span.pwsrt span.nobr')[0].text_content()
    print(celsius)
def demo8():
    url = "http://python.org/"
    html = get_page(url)
    # alternatives: parser=scraper.HTML5PARSER, or the default parser
    doc = lx.to_doc(html, parser=scraper.BEAUTIFULSOUP)
    title = doc.cssselect("html head title")[0]
    print(title.text)
def demo5():
    html = """
<html>
<table>
<tr><td>http://google.ca</td></tr>
<tr><td>http://reddit.com</td></tr>
</table>
</html>
"""
    lx.show_paths(lx.to_doc(html))
def demo4():
    html = """
<html>
<table>
<tr><td>http://google.ca</td></tr>
<tr><td>http://reddit.com</td></tr>
</table>
</html>
"""
    root = lx.to_doc(html)
    root = lx.autolink(root)
    print(lx.prettify(root))
def demo1():
    html = """
<html>
<table>
<tr><td>Header</td></tr>
<tr><td>Want This</td></tr>
</table>
<a href="http://google.ca">Google.ca</a>
</html>
"""
    root = lx.to_doc(html)
    table = root.cssselect("table")[0]
    print(table.cssselect("tr td")[0].text)
    print(root.cssselect("a[href]")[0].get("href"))
def test_prettify():
    """Each prettify method must emit closed <h1> and <html> tags."""
    doc = lx.to_doc(ex.UGLY, parser=scraper.LXML_HTML)
    #
    # nice = lx.prettify(doc, method=scraper.HTML5PARSER)  # missing
    for method in (scraper.LXML_HTML, scraper.BEAUTIFULSOUP, scraper.TIDY):
        nice = lx.prettify(doc, method=method)
        assert '</h1>' in nice and '</html>' in nice
    # An unrecognized method yields None.
    nice = lx.prettify(doc, method=None)
    assert nice is None
def demo3():
    # HTML stuffed with script/style/event-handler content.
    html = """<html>
<head>
<script type="text/javascript" src="stuff.js"></script>
<link rel="alternate" type="text/rss" src="some-rss">
<style>
body {background-image: url(javascript:do_something)};
div {color: expression(something)};
</style>
</head>
<body onload="some_function()">
Hello World!
</body>
</html>"""
    root = lx.to_doc(html)
    print(lx.prettify(root, method=scraper.BEAUTIFULSOUP))
def test_show_paths():
    """show_paths() prints "'text' => /xpath" lines; capture stdout to check.

    Fix: the original restored sys.stdout only after the assertions, so a
    failing assert left stdout redirected (and the buffer unclosed) for every
    subsequent test. The restore now lives in a finally block.
    """
    doc = lx.to_doc(ex.HTML_1)
    old_stdout = sys.stdout
    buf = StringIO()
    sys.stdout = buf
    try:
        lx.show_paths(doc, find=None)
        assert "'Want This' => /html/body/table/tr[2]/td" in buf.getvalue()
        #
        buf = StringIO()
        sys.stdout = buf
        lx.show_paths(doc, find='Google.ca')
        assert "'Google.ca' => /html/body/a" in buf.getvalue()
    finally:
        buf.close()
        sys.stdout = old_stdout
def demo7():
    # NOTE: the "<body" tag is (intentionally) left unclosed in this sample.
    text = """<html>
<body
<div></div>
<div id="content">
<ul>
<li>First item</li>
<li>Second item</li>
</ul>
</div>
</body>
</html>"""
    doc = lx.to_doc(text)
    lx.show_paths(doc)
    for item in doc.cssselect("div#content ul li"):
        print(item.text)
    print(lx.css_to_xpath("div#content ul li"))
    lx.open_in_browser(doc)
def demo2():
    url = "http://projecteuler.net/"
    root = lx.to_doc(get_page(url))
    lx.make_links_absolute(root, base_url=url)
    print(lx.tostring(root))
def test_flatten():
    """flatten() concatenates all text content and ends with a newline."""
    root = lx.to_doc(ex.HTML_1)
    flat = lx.flatten(root)
    assert flat == 'HeaderWant ThisGoogle.ca\n'
def demo3():
    # parse with BeautifulSoup, prettify with BeautifulSoup
    root = lx.to_doc(text, parser=scraper.BEAUTIFULSOUP)
    print(lx.prettify(root, method=scraper.BEAUTIFULSOUP))
def demo2():
    # parse with html5lib, prettify with BeautifulSoup
    root = lx.to_doc(text, parser=scraper.HTML5PARSER)
    print(lx.prettify(root, method=scraper.BEAUTIFULSOUP))
def demo1():
    # parse with the default parser, prettify with BeautifulSoup
    root = lx.to_doc(text)
    print(lx.prettify(root, method=scraper.BEAUTIFULSOUP))
Demo for lx.py. Download population of countries. """ import re from jabbapylib.web.scraper import lx from jabbapylib.web.web import get_page def process(doc): data = {} for row in doc.cssselect('tr'): cols = row.cssselect('td') if cols: rank = cols[0].text if rank and re.search('^\d+$', rank): country = cols[1].cssselect('a[title]')[0].text population = int(cols[2].text.replace(',', '')) data[country] = population print data ############################################################################# if __name__ == "__main__": url = 'https://secure.wikimedia.org/wikipedia/en/wiki/List_of_countries_by_population' text = get_page(url) doc = lx.to_doc(text) process(doc)
def test_autolink():
    """autolink() wraps bare URLs in anchor tags."""
    root = lx.autolink(lx.to_doc(ex.TEXT))
    html = lx.tostring(root)
    expected = ('<a href="http://retrogames.com/games/commando">'
                'http://retrogames.com/games/commando</a>')
    assert expected in html
def test_tostring():
    """tostring() serializes a doc to a non-empty str."""
    html = lx.tostring(lx.to_doc(ex.HTML_1))
    assert type(html) is str
    assert len(html) > 0
def test_make_links_absolute():
    """Relative hrefs must be rewritten against the base URL."""
    root = lx.to_doc(ex.LINKS)
    root = lx.make_links_absolute(root, base_url='http://retrogames.com')
    html = lx.tostring(root)
    for absolute in ("http://retrogames.com/games/elite",
                     "http://retrogames.com/games/commando"):
        assert absolute in html
def test_doc_to_soup():
    """doc_to_soup() converts an lxml doc to a BeautifulSoup object."""
    root = lx.to_doc(ex.HTML_1)
    soup = bs.doc_to_soup(root)
    assert isinstance(soup, BeautifulSoup)