Example #1
import lxml.html

from link_crawler import download


def test_lxml():
    url = 'http://example.webscraping.com/view/United-Kingdom-239'
    html = download(url)
    tree = lxml.html.fromstring(html)
    # select the area cell: row id places_area__row, value-cell class w2p_fw
    td = tree.cssselect('tr#places_area__row > td.w2p_fw')[0]
    area = td.text
    print area
Example #2
from bs4 import BeautifulSoup

from link_crawler import download


def test_beautifulsoup():
    url = 'http://example.webscraping.com/view/United-Kingdom-239'
    html = download(url)
    # the four lines below are the key part to understand
    soup = BeautifulSoup(html, 'html.parser')
    # find() format (inferred): attrs={'attribute': 'value'}
    tr = soup.find(attrs={'id': 'places_area__row'})
    td = tr.find(attrs={'class': 'w2p_fw'})
    area = td.text
    print area
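For comparison with the cssselect call in Example #1, the same lookup can also be written with a CSS selector in BeautifulSoup. A minimal sketch, assuming BeautifulSoup 4 (which provides select_one); the function name test_beautifulsoup_css is illustrative, not from the original:

from bs4 import BeautifulSoup


def test_beautifulsoup_css(html):
    # equivalent of the find()/find() chain above, via one CSS selector
    soup = BeautifulSoup(html, 'html.parser')
    td = soup.select_one('tr#places_area__row > td.w2p_fw')
    return td.text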
Example #3
def link_crawler(seed_url,
                 delay=0,
                 max_depth=-1,
                 max_urls=-1,
                 page_num=1,
                 user_agent='Sogou spider',
                 scrape_callback=None):

    crawl_queue = [seed_url]
    seen = {seed_url: 0}
    num_urls = 0

    links = get_pages(page_num)  # get the result-page URLs to seed the crawl
    # links lets the crawler cover multiple result pages without extra checks inside the loop
    rp = get_robots(seed_url)
    throttle = Throttle(delay)

    while crawl_queue:
        url = crawl_queue.pop()

        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url)

            if re.search('[0-9]{12}', url):  # only count qualifying links and write them to a file
                if scrape_callback:
                    scrape_callback.call_lianjia(url, html)
                num_urls += 1
                if num_urls == max_urls:
                    break

            depth = seen[url]
            if depth != max_depth:
                links.extend(get_links(html))  # collect the URLs on this page to crawl next
                for link in links:
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)
                links = []  # links are already queued after the for loop; clear the list so it does not keep growing and cause redundant checks
        else:
            print 'Blocked by robots.txt:', url
    print 'Number of links crawled: ' + str(num_urls)
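link_crawler above (and page_crawler in Example #4) lean on helpers that are not part of this snippet: download, get_links, get_pages, Throttle, and get_robots. Below is a minimal sketch of the last two, assuming only the behaviour implied by the calls above (throttle.wait(url) spaces out requests to the same domain, get_robots returns a robots.txt parser with can_fetch); it targets Python 3, where the parser lives in urllib.robotparser (robotparser under Python 2):

import time
from urllib import robotparser
from urllib.parse import urljoin, urlparse


class Throttle:
    """Delay downloads so requests to the same domain are spaced out."""

    def __init__(self, delay):
        self.delay = delay    # minimum seconds between requests to one domain
        self.domains = {}     # domain -> timestamp of the last request

    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (time.time() - last_accessed)
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = time.time()


def get_robots(seed_url):
    """Fetch and parse robots.txt for the site of seed_url."""
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(seed_url, '/robots.txt'))
    rp.read()
    return rp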
Example #4
def page_crawler(seed_url,
                 delay=0,
                 max_depth=-1,
                 max_urls=-1,
                 user_agent='Sogou spider',
                 scrape_callback=None):

    crawl_queue = [seed_url]
    seen = {seed_url: 0}
    num_urls = 0

    rp = get_robots(seed_url)
    throttle = Throttle(delay)

    while crawl_queue:
        url = crawl_queue.pop()

        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url)

            if re.search('[0-9]{12}', url):
                if scrape_callback:
                    scrape_callback.call_lianjia(url, html)
                num_urls += 1
                if num_urls == max_urls:
                    break

            depth = seen[url]
            if depth != max_depth:
                for link in get_links(html):
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)
        else:
            print 'Blocked by robots.txt:', url
    print 'Number of links crawled: ' + str(num_urls)
    return num_urls
Example #5
import re
import time

from lxml.html import fromstring


def lxml_xpath_scraper(html):
    tree = fromstring(html)
    results = {}
    for field in FIELDS:
        results[field] = tree.xpath(
            '//tr[@id="places_%s__row"]/td[@class="w2p_fw"]' %
            field)[0].text_content()
    return results
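
# FIELDS is not defined in this snippet; presumably it is the same tuple of
# country attributes as `fields` in Example #8:
FIELDS = ('area', 'population', 'iso', 'country', 'capital', 'continent',
          'tld', 'currency_code', 'currency_name', 'phone',
          'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')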


from link_crawler import download

NUM_ITERATIONS = 1000  # number of times to test each scraper
html = download(
    'http://example.webscraping.com/places/default/view/Afghanistan-1')
scrapers = [
    # ('Regular expressions', re_scraper),
    ('BeautifulSoup', bs_scraper),
    ('Lxml', lxml_scraper),
    ('Xpath', lxml_xpath_scraper)
]
for name, scraper in scrapers:
    # record start time of scrape
    start = time.time()
    for i in range(NUM_ITERATIONS):
        if scraper == re_scraper:
            re.purge()
        result = scraper(html)
        # check scraped result is as expected
        assert result['area']
    # record end time of scrape and report how long this scraper took
    end = time.time()
    print('%s: %.2f seconds' % (name, end - start))
Example #6
import lxml.html

from link_crawler import download


def get_totalnum(url):
    # read the total number of results shown on the listing page
    html = download(url)
    tree = lxml.html.fromstring(html)
    totalnum = tree.cssselect(
        'div.resultDes.clear > h2.total.fl > span')[0].text_content()
    return int(totalnum)
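get_totalnum returns the total listing count shown on a results page. A hypothetical follow-up showing how that count could drive the page_num argument of link_crawler in Example #3; both the listing URL and the page size of 30 are illustrative assumptions, not taken from the snippet:

import math

listing_url = 'https://bj.lianjia.com/ershoufang/'  # illustrative listing URL
total = get_totalnum(listing_url)
page_num = int(math.ceil(total / 30.0))             # assumed 30 listings per result page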
Example #7
import lxml.html


def lxml_scrap(html):
    tree = lxml.html.fromstring(html)
    results = {}
    for field in Field:
        results[field] = tree.cssselect('table>tr#places_%s__row > td.w2p_fw' %
                                        field)[0].text_content()
    return results


if __name__ == '__main__':
    import time
    from link_crawler import download
    NUM_ITERATIONS = 1000
    html = download('http://example.webscraping.com'
                    '/places/view/United-Kingdom-239')
    for name, scraper in [('regular expressions', re_scrap),
                          ('beautifulsoup', bs_scrap), ('lxml', lxml_scrap)]:
        start = time.time()
        for i in range(NUM_ITERATIONS):
            if scraper == re_scrap:
                re.purge()
            result = scraper(html)
            # print result
            assert result['area'] == '244,820 square kilometres'
        end = time.time()
        print '%s: %.2f seconds' % (name, end - start)
Example #8
# -*- coding: UTF-8 -*-
from link_crawler import download
import re
from bs4 import BeautifulSoup
import lxml.html
import time

html = download('http://example.webscraping.com/places/default/view/239')
print re.findall('<td class="w2p_fw">(.*?)</td>', html)[1]
print re.findall(
    '<tr id="places_area__row">.*?<td\s*class=["\']w2p_fw["\']>(.*?)</td>',
    html)[0]

fields = ('area', 'population', 'iso', 'country', 'capital', 'continent',
          'tld', 'currency_code', 'currency_name', 'phone',
          'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')


def re_scraper(html):
    results = {}
    for field in fields:
        results[field] = re.search(
            '<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>' % field,
            html).groups()[0]
    return results


def bs_scraper(html):
    soup = BeautifulSoup(html, "html.parser")
    results = {}
    for field in fields:
        tr = soup.find(attrs={'id': 'places_%s__row' % field})
        td = tr.find(attrs={'class': 'w2p_fw'})
        results[field] = td.text
    return results
Example #9
from bs4 import BeautifulSoup

from link_crawler import download

url = 'http://example.webscraping.com/places/default/view/Afghanistan-1'
html = download(url)
soup = BeautifulSoup(html, "lxml")
tr = soup.find(attrs={'id': 'places_area__row'})
td = tr.find(attrs={'class': 'w2p_fw'})  # locate the data element
area = td.text  # extract the text from the data element
print(area)
Example #10
import re

from link_crawler import download


def test_re():
    url = 'http://example.webscraping.com/view/United-Kingdom-239'
    html = download(url)
    print re.findall(
        '<tr id="places_area__row">.*?<td\s*class=["\']w2p_fw["\']>(.*?)</td>',
        html)[0]
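When the same pattern is applied many times, as in the benchmark loops above, compiling it once is the more idiomatic approach and avoids relying on re's internal pattern cache. A minimal sketch (extract_area is an illustrative name, not from the original):

import re

# compiled once at module level, then reused for every page
AREA_RE = re.compile(
    r'<tr id="places_area__row">.*?<td\s*class=["\']w2p_fw["\']>(.*?)</td>')


def extract_area(html):
    return AREA_RE.findall(html)[0]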