import re

def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)
        # scrape html here
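
All of these snippets assume a download() helper defined elsewhere. A minimal sketch, assuming urllib2 and a retry only on 5xx server errors:

import urllib2

def download(url, num_retries=2):
    """Download the page at url, returning its HTML or None on failure."""
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            # retry on 5xx HTTP errors, which are often transient
            return download(url, num_retries - 1)
    return html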
import itertools

def iteration():
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/view/-%d' % page
        html = download(url)
        if html is None:
            # received an error trying to download this webpage
            break
        else:
            # success - can scrape the result
            pass
import itertools

def iteration():
    num_errors = 0
    for page in itertools.count(1):  # itertools.count() yields an endless sequence of page IDs
        url = 'http://example.webscraping.com/view/-{}'.format(page)
        html = download(url)
        if html is None:
            num_errors += 1
            if num_errors > 5:
                # give up only after more than 5 consecutive download errors
                break
        else:
            # success - reset the consecutive error count
            num_errors = 0
import re

def link_crawler(seed_url, link_regex):
    """Crawl from the given seed url following links matched by link_regex"""
    crawl_queue = [seed_url]  # the list of URLs still to be crawled
    while crawl_queue:
        url = crawl_queue.pop()  # remove and return the last element of the list
        html = download(url)
        # filter for links matching our regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                # add this link to the crawl queue
                # link = urlparse.urljoin(seed_url, link)  # form absolute link
                crawl_queue.append(link)
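
Both crawler versions call a get_links() helper that is not shown here. A minimal sketch, assuming links are pulled from anchor tags with a regular expression:

import re

def get_links(html):
    """Return a list of links found in html"""
    # match the href attribute of anchor tags
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)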
import re
import urlparse

def link_crawler(seed_url, link_regex):
    crawl_queue = [seed_url]
    seen = set(crawl_queue)  # keep track of which URLs have been seen before
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urlparse.urljoin(seed_url, link)  # form an absolute link
                if link not in seen:
                    # not seen this link before, so queue it
                    seen.add(link)
                    crawl_queue.append(link)
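
A quick way to exercise the crawler; the regex is illustrative and matches the example site's index and country-view links:

link_crawler('http://example.webscraping.com', '/(index|view)')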
import re
import time

def main():
    num_iterations = 100  # number of times to test each scraper
    html = download('http://example.webscraping.com/places/default/view/China-47')
    for name, scraper in [('Regular Expression', re_scraper),
                          ('BeautifulSoup', bs_scraper),
                          ('Lxml', lxml_scraper)]:
        # record start time of scrape
        start = time.time()
        for i in range(num_iterations):
            if scraper == re_scraper:
                # the re module caches compiled patterns; purge the cache
                # so each scraper is timed fairly
                re.purge()
            result = scraper(html)
            # assert that the scraped result is as expected
            assert result['area'] == '9,596,960 square kilometres'
        # record end time of scrape and output the total
        end = time.time()
        print '%s: %.2f seconds' % (name, end - start)
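
main() benchmarks three scraper callables that are defined elsewhere. A minimal sketch of each, extracting only the 'area' field and assuming the same places_area__row markup that scrape() below relies on (BeautifulSoup and lxml are third-party packages):

import re
import lxml.html
from bs4 import BeautifulSoup

def re_scraper(html):
    # search the raw HTML string with a regular expression
    return {'area': re.search(
        r'<tr id="places_area__row">.*?<td class="w2p_fw">(.*?)</td>',
        html).group(1)}

def bs_scraper(html):
    # parse the page, then navigate to the target cell
    soup = BeautifulSoup(html, 'html.parser')
    return {'area': soup.find('tr', id='places_area__row')
                        .find('td', class_='w2p_fw').text}

def lxml_scraper(html):
    # parse the page, then select the cell with a CSS selector
    tree = lxml.html.fromstring(html)
    return {'area': tree.cssselect(
        'tr#places_area__row > td.w2p_fw')[0].text_content()}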
import re

def scrape(url):
    html = download(url)
    # extract the contents of the area row's table cell
    area = re.findall(
        r'<tr id="places_area__row">.*?<td\s*class=["\']w2p_fw["\']>(.*?)</td>',
        html)[0]
    return area
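
Called with the country page used in the benchmark above, scrape() returns the raw area value (assuming the page markup is unchanged):

print scrape('http://example.webscraping.com/places/default/view/China-47')
# 9,596,960 square kilometres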