def run(self):
    """Worker loop: fetch, store and expand URLs taken from ``self.url_queue``.

    Each iteration takes one ``(url, depth)`` item from the queue, downloads
    it with ``urllib2.urlopen`` (bounded by ``self.crawl_timeout``), decodes
    the body with ``encoding_adaptor.decode`` and hands the result to
    ``webpage_save.save`` together with ``self.save_dir``.  While
    ``depth < self.max_depth`` the links extracted from the page are queued
    again with ``depth + 1``.

    Failure handling:
      * a failed download is logged and the item is dropped; the loop moves
        straight on to the next item without sleeping;
      * a failure while decoding, saving or parsing is logged as well, so
        one bad page does not end the worker;
      * ``task_done()`` is called exactly once for every item taken from
        the queue, whatever the outcome.

    The loop has no explicit exit: it only ends when ``self.url_queue.get``
    raises.
    NOTE(review): presumably ``url_queue`` is a ``Queue.Queue``, in which
    case that is ``Queue.Empty`` after 3 seconds without work -- confirm
    that callers rely on this to stop the thread.
    """
    while True:
        logging.info("[Queue_num before get:%d]", self.url_queue.qsize())
        url, depth = self.url_queue.get(block=True, timeout=3)

        # Download the page.  urllib2.URLError and socket.timeout are both
        # subclasses of Exception and were handled identically, so a single
        # handler covers them.
        try:
            response = urllib2.urlopen(url, timeout=self.crawl_timeout)
            try:
                page_content = response.read()
            finally:
                # Release the connection even when read() fails.
                response.close()
        except Exception as e:
            logging.error("request of url:%s failed, [Exception]:%s, [Queue_num:%d]",
                          url, e, self.url_queue.qsize())
            self.url_queue.task_done()
            continue

        try:
            # Decode the raw bytes; yields the decoded page and its encoding.
            page, encoding = encoding_adaptor.decode(page_content)

            # Persist the page.
            webpage_save.save(url, page, encoding, self.save_dir)

            # Only follow links while the depth limit has not been reached.
            if depth < self.max_depth:
                page_parser = webpage_parse.PageParser()
                page_parser.feed(page)
                extracted_urls = page_parser.extract_urls_from_page(url)
                self.add_urls_to_queue(extracted_urls, depth + 1)
        except Exception as e:
            logging.error("processing of url:%s failed, [Exception]:%s, [Queue_num:%d]",
                          url, e, self.url_queue.qsize())
        finally:
            # Must run on every path: a missing task_done() leaves the
            # queue's unfinished-task count above zero forever.
            self.url_queue.task_done()

        logging.info("[Queue_num after get:%d]", self.url_queue.qsize())

        # Throttle requests so the crawler's IP does not get banned.
        time.sleep(self.crawl_interval)
def testWebpageSave(self):
    """Check that ``webpage_save.save`` returns status 0 for 'testoutput'.

    A DEBUG-level stream handler is attached to the ``'testlogger'`` logger
    for the duration of the call and detached afterwards, then the first
    element of the returned ``(resno, resinfo)`` pair is asserted to be 0.
    """
    logger = logging.getLogger('testlogger')
    logger.setLevel(logging.DEBUG)

    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    try:
        resno, resinfo = webpage_save.save('http://www.baidu.com', 'abdc', 'testoutput', logger)
    finally:
        # logging.getLogger returns the same object for the same name, so a
        # handler left attached here would pile up on every run and
        # duplicate each log line.
        logger.removeHandler(ch)

    # Surface the info returned by save() when the status is not 0.
    self.assertEqual(resno, 0, "save returned info: %s" % (resinfo,))
def test_save_failure(self):
    """Expect a non-zero status from ``webpage_save.save`` for target "urls".

    Logging is initialised through ``log.init_log`` first; the ``logging``
    module itself is then passed as the logger argument.
    NOTE(review): presumably "urls" cannot be used as an output directory in
    the test environment -- confirm against the test fixtures.
    """
    log.init_log("./log/webpage_save_test", logging.DEBUG)

    save_args = ("http://www.baidu.com", "abdc", "urls", logging)
    status, _detail = webpage_save.save(*save_args)

    self.assertNotEqual(status, 0)
def test_save_success(self):
    """Expect status 0 from ``webpage_save.save`` for target "testoutput".

    Logging is initialised through ``log.init_log`` first; the ``logging``
    module itself is then passed as the logger argument.
    """
    log.init_log("./log/webpage_save_test", logging.DEBUG)

    save_args = ("http://www.baidu.com", "abdc", "testoutput", logging)
    status, _detail = webpage_save.save(*save_args)

    self.assertEqual(status, 0)