Example #1
    def run(self):
        """thread function
        """

        while True:
            logging.info("[Queue_num before get:%d]" % self.url_queue.qsize())
            try:
                url, depth = self.url_queue.get(block=True, timeout=3)
            except Queue.Empty:
                # nothing arrived within the timeout (Queue.Empty is from the
                # Queue module): assume the crawl is finished and stop cleanly
                # instead of letting the exception kill the thread
                break

            try:
                response = urllib2.urlopen(url, timeout=self.crawl_timeout)
                page_content = response.read()
            except Exception as e:
                # urllib2.URLError, socket.timeout and any other failure were
                # handled identically, so one handler suffices: log the error,
                # mark the task done, and move on to the next url
                logging.error("request of url:%s failed, [Exception]:%s, [Queue_num:%d]"
                              % (url, e, self.url_queue.qsize()))
                self.url_queue.task_done()
                continue

            # decode the page; returns the unicode page and its detected encoding
            page, encoding = encoding_adaptor.decode(page_content)

            # save webpage
            webpage_save.save(url, page, encoding, self.save_dir)

            # if depth has not reached max_depth, extract the urls and enqueue them
            if depth < self.max_depth:
                page_parser = webpage_parse.PageParser()
                page_parser.feed(page)
                extracted_urls = page_parser.extract_urls_from_page(url)
                self.add_urls_to_queue(extracted_urls, depth + 1)

            self.url_queue.task_done()
            logging.info("[Queue_num after get:%d]" % self.url_queue.qsize())

            # sleep between requests to avoid getting the ip banned
            time.sleep(self.crawl_interval)
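
For context, a run loop like this is usually driven by a small harness that seeds the queue and starts several worker threads. A minimal sketch under assumed names follows: CrawlerThread, its constructor arguments, and start_crawl are hypothetical and not part of the original module.

# -*- coding: utf-8 -*-
import Queue

import crawler_thread  # hypothetical module containing the run() method above


def start_crawl(seed_urls, thread_count=5):
    """Drive the worker loop shown above (sketch; names are assumptions)."""
    url_queue = Queue.Queue()
    # seed the queue at depth 0
    for url in seed_urls:
        url_queue.put((url, 0))

    for _ in range(thread_count):
        # CrawlerThread is hypothetical: a threading.Thread subclass
        # whose run() is the method shown above
        worker = crawler_thread.CrawlerThread(url_queue)
        worker.setDaemon(True)  # let the process exit even if a worker hangs
        worker.start()

    # block until every enqueued url has been marked task_done()
    url_queue.join()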
Example #2
    def testWebpageSave(self):
        """ test webpage_save module
        """
        logger = logging.getLogger('testlogger')
        logger.setLevel(logging.DEBUG)
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        ch.setFormatter(formatter)
        logger.addHandler(ch)
        resno, resinfo = webpage_save.save('http://www.baidu.com', 'abdc', 'testoutput', logger)
        self.assertEqual(resno, 0)
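
For reference, the tests here call webpage_save.save(url, page, save_dir, logger) and unpack a (resno, resinfo) pair, with 0 meaning success (note that Example #1 instead passes an encoding and a save_dir, so the signature evidently varies between versions). A minimal sketch consistent with the tests' call shape follows; the file-naming scheme and error codes are guesses, not the real module:

# -*- coding: utf-8 -*-
import os
import urlparse


def save(url, page, save_dir, logger):
    """Hypothetical sketch of webpage_save.save: returns (0, info) on
    success and (non-zero, error message) on failure."""
    if not os.path.isdir(save_dir):
        logger.error("save dir does not exist: %s" % save_dir)
        return 1, "save dir does not exist"
    # derive a file name from the url (one possible scheme)
    name = urlparse.urlparse(url).netloc or "page"
    path = os.path.join(save_dir, name)
    try:
        with open(path, 'w') as f:
            f.write(page)
    except IOError as e:
        logger.error("failed to write %s: %s" % (path, e))
        return 2, str(e)
    logger.info("saved %s to %s" % (url, path))
    return 0, path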
Example #3
    def test_save_failure(self):
        """ test save function failure """
        log.init_log("./log/webpage_save_test", logging.DEBUG)
        resno, resinfo = webpage_save.save("http://www.baidu.com", "abdc", "urls", logging)
        self.assertNotEqual(resno, 0)
Example #4
    def test_save_success(self):
        """ test save function success """
        log.init_log("./log/webpage_save_test", logging.DEBUG)
        resno, resinfo = webpage_save.save("http://www.baidu.com", "abdc", "testoutput", logging)
        self.assertEqual(resno, 0)
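
These test methods would live inside a unittest.TestCase subclass, which the snippets do not show. A minimal harness, with the class name TestWebpageSave assumed, might look like:

import unittest


class TestWebpageSave(unittest.TestCase):
    # the test methods shown above (testWebpageSave, test_save_failure,
    # test_save_success) would be defined here
    pass


if __name__ == '__main__':
    unittest.main()  # runs every test* method in the module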