def test_parse_links_from_page(self):
        """Check that parse_links_from_page returns absolute URLs for page links.

        The sample markup holds one anchor ``href`` and one image ``src``, both
        relative; each is expected back joined onto the directory of the page
        URL.
        """
        page_url = "http://pycm.baidu.com:8081/3/page3_4.html"
        html = """
        <div>
        <a href="a.cpp"> </a>
        <img src="image.jpg" style="display: block;">
        </div>
        """
        expected = [
            'http://pycm.baidu.com:8081/3/a.cpp',
            'http://pycm.baidu.com:8081/3/image.jpg',
        ]

        found = spider_util.parse_links_from_page(html, page_url)
        # Order-insensitive comparison: only the collection of links matters.
        self.assertItemsEqual(found, expected)
# Example #2
# 0
    def process(self, url, depth):
        """
        Handle the crawl task for a single page.

        Downloads the page; when the download yields content, saves it and
        enqueues every link found on the page that is not yet in
        ``self.url_set``, one level deeper than the current page.

        Args:
            url: address of the page to fetch
            depth: depth of the current page
        """
        logging.info("thread [%d] process begin, url: %s, depth: %s", self.idx, url, depth)
        page = self.page_downloader.download(url)
        if page:
            self.page_saver.save_to_file(url, page)
            for link in spider_util.parse_links_from_page(page, url):
                # Links already recorded in url_set are not queued again.
                if link in self.url_set:
                    continue
                self.url_queue.put((link, depth + 1))
                self.url_set.add(link)
        logging.info("thread [%d]  process end, url: %s, depth: %s", self.idx, url, depth)