Example No. 1
def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)
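All of the examples on this page call a download() helper that is not included in the snippets. Below is a minimal sketch of such a helper, assuming Python 2 and urllib2 (to match the print statements and urlparse usage in the later examples); the retry-on-server-error behaviour is an assumption, not something shown in the examples:

import urllib2

def download(url, num_retries=2):
    # fetch the page and return its HTML, or None if the request fails
    print 'Downloading:', url
    try:
        html = urllib2.urlopen(url).read()
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = None
        if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            # retry on 5xx server errors (assumed behaviour)
            return download(url, num_retries - 1)
    return html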
Example No. 2
def iteration():
    for page in itertools.count(1):
        url = 'http://example.webscraping.com/view/-%d' % page
        html = download(url)
        if html is None:
            break
        else:
            # success - can scrape the result
            pass
Example No. 3
def iteration():
    num_errors = 0
    for page in itertools.count(1):
        # itertools.count() yields an infinite sequence of page numbers
        url = 'http://example.webscraping.com/view/-{}'.format(page)
        html = download(url)
        if html is None:
            num_errors += 1
            if num_errors > 5:  # give up only after more than 5 consecutive download errors
                break
        else:
            num_errors = 0
Example No. 4
def link_crawler(seed_url, link_regex):
    """Crawl from the given seed url following links matched by link_regex"""
    crawl_queue = [seed_url]  # start the queue as a list holding the seed URL
    print crawl_queue
    while crawl_queue:
        url = crawl_queue.pop()  # remove and return the last URL in the list
        html = download(url)
        # print crawl_queue
        # filter for links matching our regular expression
        for link in get_links(html):
            if re.match(link_regex, link):
                # add this link to crawl queue
                # link = urlparse.urljoin(seed_url, link)   # form absolute link
                crawl_queue.append(link)
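Examples No. 4 and No. 5 also depend on a get_links() helper that is not shown. One possible sketch, extracting href values with a regular expression (the exact pattern is an assumption):

import re

def get_links(html):
    # return a list of link targets (href values) found in the HTML
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)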
Example No. 5
def link_crawler(seed_url, link_regex):
    crawl_queue = [seed_url]
    seen = set(crawl_queue)   # keep track of which URLs have been seen before, stored in a set
    # print seen
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            # print link
            if re.match(link_regex, link):
                # print 'success'
                link = urlparse.urljoin(seed_url, link)
                if link not in seen:     # check if have already seen this link
                    seen.add(link)
                    crawl_queue.append(link)
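An illustrative call to the deduplicating crawler from Example No. 5; the seed URL matches the site used throughout, but the link_regex value is only an assumption about which paths are worth following:

link_crawler('http://example.webscraping.com', '/(index|view)')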
Example No. 6
def main():
    num_iterations = 100  # number of times to test each scraper
    html = download(
        'http://example.webscraping.com/places/default/view/China-47')
    for name, scraper in [('Regular Expression', re_scraper),
                          ('BeautifulSoup', bs_scraper),
                          ('Lxml', lxml_scraper)]:
        # record start time of scrape
        start = time.time()
        for i in range(num_iterations):
            if scraper == re_scraper:
                re.purge()  # the re module caches compiled patterns; clear the cache so the comparison stays fair
            result = scraper(html)
            # check the scraped result is as expected; assert raises an error if the condition is false
            assert result['area'] == '9,596,960 square kilometres'
        # record end time of scrape and output the total
        end = time.time()
        print '%s: %.2f seconds' % (name, end - start)
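Example No. 6 benchmarks three scraper callables (re_scraper, bs_scraper, lxml_scraper) whose definitions are not included here. Hypothetical minimal versions that extract only the 'area' field checked by the assert, with the HTML structure inferred from the regular expression in Example No. 7 (bs4 and lxml are assumed to be available):

import re
import lxml.html
from bs4 import BeautifulSoup

def re_scraper(html):
    # regular-expression based extraction of the area field
    match = re.search(
        '<tr id="places_area__row">.*?<td\s*class=["\']w2p_fw["\']>(.*?)</td>',
        html)
    return {'area': match.group(1)}

def bs_scraper(html):
    # BeautifulSoup based extraction of the area field
    soup = BeautifulSoup(html, 'html.parser')
    td = soup.find('tr', id='places_area__row').find('td', class_='w2p_fw')
    return {'area': td.text}

def lxml_scraper(html):
    # lxml based extraction of the area field
    tree = lxml.html.fromstring(html)
    td = tree.xpath('//tr[@id="places_area__row"]/td[@class="w2p_fw"]')[0]
    return {'area': td.text_content()}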
Example No. 7
def scrape(url):
    html = download(url)
    area = re.findall(
        '<tr id="places_area__row">.*?<td\s*class=["\']w2p_fw["\']>(.*?)</td>',
        html)[0]
    return area
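A short usage sketch for Example No. 7, reusing the country page URL from Example No. 6 (purely illustrative):

area = scrape('http://example.webscraping.com/places/default/view/China-47')
print area  # Example No. 6 asserts this value is '9,596,960 square kilometres'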