def test_to_doc():
    """to_doc() must yield an lxml HtmlElement for every supported parser."""
    # Whole documents: each parser produces an HtmlElement.
    for parser in (scraper.LXML_HTML, scraper.HTML5PARSER, scraper.BEAUTIFULSOUP):
        doc = lx.to_doc(ex.HTML_1, parser)  # LXML_HTML is the default parser
        assert isinstance(doc, lxml.html.HtmlElement)
    # An unrecognized parser yields None.
    doc = lx.to_doc(ex.HTML_1, parser=None)
    assert doc is None
    #
    # Now the same checks with HTML fragments (whole_doc=False).
    for parser in (scraper.LXML_HTML, scraper.HTML5PARSER, scraper.BEAUTIFULSOUP):
        doc = lx.to_doc(ex.FRAGMENT, parser, whole_doc=False)
        assert isinstance(doc, lxml.html.HtmlElement)
    doc = lx.to_doc(ex.FRAGMENT, parser=None, whole_doc=False)
    assert doc is None
def get_image_url_list(url):
    """Controller function for getting the URLs of the JPG images."""
    page_html = get_page(url)
    root = lx.to_doc(page_html)
    subpage_urls = get_subpages(root)
    return extract_images_from_pages(subpage_urls)
def process(word):
    """Process the given word.

    The return value is a tuple: (word, hyphenation, pronunciation mp3).
    """
    page_url = _template.format(word=word)
    page_html = web.get_page(page_url, user_agent=True)
    root = lx.to_doc(page_html)
    return (word, get_hyphen(root), get_mp3(root))
def extract_images_from_pages(pages):
    """Extract images from subpages."""
    candidates = (get_jpg_image(lx.to_doc(get_page(page))) for page in pages)
    return [img for img in candidates if img]  # drop None elems
def demo6():
    # Deliberately sloppy HTML: the second <li> is never closed.
    text = """<ul>
<li>abc</li>
<li>def
<li>ghi</li>
</ul>"""
    doc = lx.to_doc(text)
    for item in doc.cssselect("ul li"):
        print(item.text.strip())
def process(url):
    html = get_page(url, user_agent=True)
    doc = lx.to_doc(html)
    #lx.show_paths(doc, find='Montreal, Quebec')
    city = doc.cssselect('h1#locationName.brTopLeft5')[0].text
    print(city)
    celsius = doc.cssselect('div#tempActual span.pwsrt span.nobr')[0].text_content()
    print(celsius)
def demo8():
    url = "http://python.org/"
    html = get_page(url)
    # alternatives: parser=scraper.HTML5PARSER, or the default parser
    doc = lx.to_doc(html, parser=scraper.BEAUTIFULSOUP)
    title = doc.cssselect("html head title")[0]
    print(title.text)
def demo5():
    html = """
<html>
<table>
<tr><td>http://google.ca</td></tr>
<tr><td>http://reddit.com</td></tr>
</table>
</html>
"""
    lx.show_paths(lx.to_doc(html))
def demo4():
    html = """
<html>
<table>
<tr><td>http://google.ca</td></tr>
<tr><td>http://reddit.com</td></tr>
</table>
</html>
"""
    root = lx.to_doc(html)
    root = lx.autolink(root)
    print(lx.prettify(root))
def demo1():
    html = """
<html>
<table>
<tr><td>Header</td></tr>
<tr><td>Want This</td></tr>
</table>
<a href="http://google.ca">Google.ca</a>
</html>
"""
    root = lx.to_doc(html)
    table = root.cssselect("table")[0]
    print(table.cssselect("tr td")[0].text)
    print(root.cssselect("a[href]")[0].get("href"))
def test_prettify():
    """Each prettify method must emit closed <h1> and <html> tags."""
    doc = lx.to_doc(ex.UGLY, parser=scraper.LXML_HTML)
    #
    # nice = lx.prettify(doc, method=scraper.HTML5PARSER)  # missing
    for method in (scraper.LXML_HTML, scraper.BEAUTIFULSOUP, scraper.TIDY):
        nice = lx.prettify(doc, method=method)
        assert '</h1>' in nice and '</html>' in nice
    # An unrecognized method yields None.
    nice = lx.prettify(doc, method=None)
    assert nice is None
def demo3():
    # HTML stuffed with script/style/event-handler content.
    html = """<html>
<head>
<script type="text/javascript" src="stuff.js"></script>
<link rel="alternate" type="text/rss" src="some-rss">
<style>
body {background-image: url(javascript:do_something)};
div {color: expression(something)};
</style>
</head>
<body onload="some_function()">
Hello World!
</body>
</html>"""
    root = lx.to_doc(html)
    print(lx.prettify(root, method=scraper.BEAUTIFULSOUP))
def test_show_paths():
    """show_paths() prints "'text' => /xpath" lines; capture stdout to check.

    Fix: the original restored sys.stdout only after the assertions, so a
    failing assert left stdout redirected (and the buffer unclosed) for every
    subsequent test. The restore now lives in a finally block.
    """
    doc = lx.to_doc(ex.HTML_1)
    old_stdout = sys.stdout
    buf = StringIO()
    sys.stdout = buf
    try:
        lx.show_paths(doc, find=None)
        assert "'Want This' => /html/body/table/tr[2]/td" in buf.getvalue()
        #
        buf = StringIO()
        sys.stdout = buf
        lx.show_paths(doc, find='Google.ca')
        assert "'Google.ca' => /html/body/a" in buf.getvalue()
    finally:
        buf.close()
        sys.stdout = old_stdout
def demo7():
    # NOTE: the "<body" tag is (intentionally) left unclosed in this sample.
    text = """<html>
<body
<div></div>
<div id="content">
<ul>
<li>First item</li>
<li>Second item</li>
</ul>
</div>
</body>
</html>"""
    doc = lx.to_doc(text)
    lx.show_paths(doc)
    for item in doc.cssselect("div#content ul li"):
        print(item.text)
    print(lx.css_to_xpath("div#content ul li"))
    lx.open_in_browser(doc)
def demo2():
    url = "http://projecteuler.net/"
    root = lx.to_doc(get_page(url))
    lx.make_links_absolute(root, base_url=url)
    print(lx.tostring(root))
def test_flatten():
    """flatten() concatenates all text content and ends with a newline."""
    root = lx.to_doc(ex.HTML_1)
    flat = lx.flatten(root)
    assert flat == 'HeaderWant ThisGoogle.ca\n'
def demo3():
    # parse with BeautifulSoup, prettify with BeautifulSoup
    root = lx.to_doc(text, parser=scraper.BEAUTIFULSOUP)
    print(lx.prettify(root, method=scraper.BEAUTIFULSOUP))
def demo2():
    # parse with html5lib, prettify with BeautifulSoup
    root = lx.to_doc(text, parser=scraper.HTML5PARSER)
    print(lx.prettify(root, method=scraper.BEAUTIFULSOUP))
def demo1():
    # parse with the default parser, prettify with BeautifulSoup
    root = lx.to_doc(text)
    print(lx.prettify(root, method=scraper.BEAUTIFULSOUP))
Demo for lx.py. Download population of countries. """ import re from jabbapylib.web.scraper import lx from jabbapylib.web.web import get_page def process(doc): data = {} for row in doc.cssselect('tr'): cols = row.cssselect('td') if cols: rank = cols[0].text if rank and re.search('^\d+$', rank): country = cols[1].cssselect('a[title]')[0].text population = int(cols[2].text.replace(',', '')) data[country] = population print data ############################################################################# if __name__ == "__main__": url = 'https://secure.wikimedia.org/wikipedia/en/wiki/List_of_countries_by_population' text = get_page(url) doc = lx.to_doc(text) process(doc)
def test_autolink():
    """autolink() wraps bare URLs in anchor tags."""
    root = lx.autolink(lx.to_doc(ex.TEXT))
    html = lx.tostring(root)
    expected = ('<a href="http://retrogames.com/games/commando">'
                'http://retrogames.com/games/commando</a>')
    assert expected in html
def test_tostring():
    """tostring() serializes a doc to a non-empty str."""
    html = lx.tostring(lx.to_doc(ex.HTML_1))
    assert type(html) is str
    assert len(html) > 0
def test_make_links_absolute():
    """Relative hrefs must be rewritten against the base URL."""
    root = lx.to_doc(ex.LINKS)
    root = lx.make_links_absolute(root, base_url='http://retrogames.com')
    html = lx.tostring(root)
    for absolute in ("http://retrogames.com/games/elite",
                     "http://retrogames.com/games/commando"):
        assert absolute in html
def test_doc_to_soup():
    """doc_to_soup() converts an lxml doc to a BeautifulSoup object."""
    root = lx.to_doc(ex.HTML_1)
    soup = bs.doc_to_soup(root)
    assert isinstance(soup, BeautifulSoup)