def test_lxml():
    url = 'http://example.webscraping.com/view/United-Kingdom-239'
    html = download(url)
    tree = lxml.html.fromstring(html)
    td = tree.cssselect('tr#places_area__row > td.w2p_fw')[0]
    area = td.text
    print area
def test_beautifulsoup():
    url = 'http://example.webscraping.com/view/United-Kingdom-239'
    html = download(url)
    # the next four lines do the actual extraction
    soup = BeautifulSoup(html, 'html.parser')
    # find() takes attrs={'attribute name': 'value'}
    tr = soup.find(attrs={'id': 'places_area__row'})
    td = tr.find(attrs={'class': 'w2p_fw'})
    area = td.text
    print area
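# The two test functions above rely on imports that are not shown in these
# snippets. A minimal sketch of what they would need (download() comes from
# this project's link_crawler module; the rest are standard packages):
import lxml.html
from bs4 import BeautifulSoup
from link_crawler import download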
def link_crawler(seed_url, delay=0, max_depth=-1, max_urls=-1, page_num=1,
                 user_agent='Sogou spider', scrape_callback=None):
    crawl_queue = [seed_url]
    seen = {seed_url: 0}
    num_urls = 0
    # seed with the paginated listing URLs so multiple pages can be crawled
    # without adding extra checks inside the loop
    links = get_pages(page_num)
    rp = get_robots(seed_url)
    throttle = Throttle(delay)
    while crawl_queue:
        url = crawl_queue.pop()
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url)
            if re.search('[0-9]{12}', url):
                # only count links matching the 12-digit pattern and hand them
                # to the callback, which writes them to file
                if scrape_callback:
                    scrape_callback.call_lianjia(url, html)
                num_urls += 1
                if num_urls == max_urls:
                    break
            depth = seen[url]
            if depth != max_depth:
                links.extend(get_links(html))  # collect the URLs to crawl next
                for link in links:
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)
                # the links are already queued; clear the list so it does not
                # keep growing and cause redundant membership checks
                links = []
        else:
            print 'Blocked by robots.txt:', url
    print 'Links crawled: ' + str(num_urls)
def page_crawler(seed_url, delay=0, max_depth=-1, max_urls=-1,
                 user_agent='Sogou spider', scrape_callback=None):
    crawl_queue = [seed_url]
    seen = {seed_url: 0}
    num_urls = 0
    rp = get_robots(seed_url)
    throttle = Throttle(delay)
    while crawl_queue:
        url = crawl_queue.pop()
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url)
            if re.search('[0-9]{12}', url):
                if scrape_callback:
                    scrape_callback.call_lianjia(url, html)
                num_urls += 1
                if num_urls == max_urls:
                    break
            depth = seen[url]
            if depth != max_depth:
                for link in get_links(html):
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)
        else:
            print 'Blocked by robots.txt:', url
    print 'Links crawled: ' + str(num_urls)
    return num_urls
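# link_crawler() and page_crawler() above depend on a Throttle class plus
# get_robots()/get_links() helpers that are not shown here. The sketch below
# is only an assumption of what they might look like, inferred from how they
# are called (Python 2, standard library only):
import re
import time
import urlparse
import robotparser

class Throttle:
    """Add a delay between downloads to the same domain."""
    def __init__(self, delay):
        self.delay = delay
        self.domains = {}  # timestamp of the last access per domain

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (time.time() - last_accessed)
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = time.time()

def get_robots(url):
    """Parse the site's robots.txt so can_fetch() can be checked in the loop."""
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp

def get_links(html):
    """Return all href values found in the page HTML."""
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)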
from lxml.html import fromstring

def lxml_xpath_scraper(html):
    tree = fromstring(html)
    results = {}
    for field in FIELDS:
        results[field] = tree.xpath(
            '//tr[@id="places_%s__row"]/td[@class="w2p_fw"]' % field)[0].text_content()
    return results

import re
import time
from link_crawler import download

NUM_ITERATIONS = 1000  # number of times to test each scraper
html = download(
    'http://example.webscraping.com/places/default/view/Afghanistan-1')
# FIELDS, re_scraper, bs_scraper and lxml_scraper are assumed to be defined or
# imported elsewhere in this file
scrapers = [
    # ('Regular expressions', re_scraper),
    ('BeautifulSoup', bs_scraper),
    ('Lxml', lxml_scraper),
    ('Xpath', lxml_xpath_scraper)
]
for name, scraper in scrapers:
    # record start time of scrape
    start = time.time()
    for i in range(NUM_ITERATIONS):
        if scraper == re_scraper:
            re.purge()
        result = scraper(html)
        # check scraped result is as expected
    end = time.time()
    print '%s: %.2f seconds' % (name, end - start)
def get_totalnum(url):
    html = download(url)
    tree = lxml.html.fromstring(html)
    totalnum = tree.cssselect(
        'div.resultDes.clear > h2.total.fl > span')[0].text_content()
    return int(totalnum)
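# get_totalnum() reads the result count from a listing page, and link_crawler()
# above calls a get_pages() helper that is not shown in these snippets. A
# hedged sketch of how the two could fit together; the base URL, the 'pg%d'
# pattern and the 30-results-per-page figure are placeholder assumptions, not
# taken from the original code:
def get_pages(page_num, base_url='http://example.com/list/'):
    """Build the URLs of the first page_num listing pages."""
    return [base_url + 'pg%d/' % page for page in range(1, page_num + 1)]

# e.g. cap page_num by the total number of results:
# total = get_totalnum(base_url)
# page_num = min(page_num, (total + 29) // 30)  # assuming 30 results per page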
import lxml.html

def lxml_scrap(html):
    tree = lxml.html.fromstring(html)
    results = {}
    for field in Field:
        results[field] = tree.cssselect(
            'table > tr#places_%s__row > td.w2p_fw' % field)[0].text_content()
    return results

if __name__ == '__main__':
    import time
    from link_crawler import download
    NUM_ITERATIONS = 1000
    html = download('http://example.webscraping.com'
                    '/places/view/United-Kingdom-239')
    for name, scraper in [('regular expressions', re_scrap),
                          ('beautifulsoup', bs_scrap),
                          ('lxml', lxml_scrap)]:
        start = time.time()
        for i in range(NUM_ITERATIONS):
            if scraper == re_scrap:
                re.purge()
            result = scraper(html)
            # print result
            assert (result['area'] == '244,820 square kilometres')
        end = time.time()
        print '%s: %.2f seconds' % (name, end - start)
# -*- coding: UTF-8 -*-
from link_crawler import download
import re
from bs4 import BeautifulSoup
import lxml.html
import time

html = download('http://example.webscraping.com/places/default/view/239')
print re.findall('<td class="w2p_fw">(.*?)</td>', html)[1]
print re.findall(
    '<tr id="places_area__row">.*?<td\s*class=["\']w2p_fw["\']>(.*?)</td>',
    html)[0]

fields = ('area', 'population', 'iso', 'country', 'capital', 'continent',
          'tld', 'currency_code', 'currency_name', 'phone',
          'postal_code_format', 'postal_code_regex', 'languages', 'neighbours')

def re_scraper(html):
    results = {}
    for field in fields:
        results[field] = re.search(
            '<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>' % field,
            html).groups()[0]
    return results

def bs_scraper(html):
    soup = BeautifulSoup(html, "html.parser")
    results = {}
    for field in fields:
        # same attrs-based lookup as in test_beautifulsoup() above
        tr = soup.find(attrs={'id': 'places_%s__row' % field})
        results[field] = tr.find(attrs={'class': 'w2p_fw'}).text
    return results
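# Quick sanity check of the two scrapers defined above; the field values are
# whatever the downloaded page contains, so none are hard-coded here:
if __name__ == '__main__':
    for name, scraper in [('re', re_scraper), ('bs', bs_scraper)]:
        result = scraper(html)
        print name, result['area'], result['population']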
from bs4 import BeautifulSoup
from link_crawler import download

url = 'http://example.webscraping.com/places/default/view/Afghanistan-1'
html = download(url)
soup = BeautifulSoup(html, "lxml")
tr = soup.find(attrs={'id': 'places_area__row'})
td = tr.find(attrs={'class': 'w2p_fw'})  # locate the data element
area = td.text  # extract the text from the data element
print(area)
def test_re():
    url = 'http://example.webscraping.com/view/United-Kingdom-239'
    html = download(url)
    print re.findall(
        '<tr id="places_area__row">.*?<td\s*class=["\']w2p_fw["\']>(.*?)</td>',
        html)[0]
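# Every snippet above imports download() from link_crawler, which is not shown
# in this section. A minimal Python 2 sketch of such a helper, as an assumption
# of what it might look like (the real one may add caching, proxies, etc.):
import urllib2

def download(url, user_agent='Sogou spider', num_retries=2):
    """Fetch a URL and return its HTML, retrying on 5xx server errors."""
    print 'Downloading:', url
    request = urllib2.Request(url, headers={'User-agent': user_agent})
    try:
        html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            # retry once more on server error
            html = download(url, user_agent, num_retries - 1)
    return html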