Example #1
 def testcase_ThreadPool_get_result_success(self):
     """测试用例3:get_result,所有任务执行完后结果为1"""
     jobs = [i for i in xrange(2)]
     pool = ThreadPool(3, test_function, jobs, 0)
     pool.wait_allcomplete()
     sum = 0
     while True:
         try:
             res = pool.get_result()
             arr_res = json.loads(res)
             sum += int(arr_res['url'])
         except Queue.Empty as e:
             self.logging.info(e)
             break
     self.assertEqual(1, sum)
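
The test above only exercises ThreadPool through its public surface: a constructor taking (thread_count, function, jobs, interval), wait_allcomplete(), and get_result() raising Queue.Empty once the results are drained. Below is a minimal, hypothetical Python 2 sketch of a pool with that contract, plus an assumed test_function that echoes the job id as the 'url' field; it only illustrates the interface the test relies on and is not the project's actual implementation.

# -*- coding: utf-8 -*-
# Hypothetical minimal ThreadPool matching the interface used by the test
# above (Python 2, like the surrounding examples); not the project's code.
import json
import threading
import time
import Queue


def test_function(job):
    # assumed worker: store the job id as the 'url' field of a JSON result
    return json.dumps({'url': str(job), 'html': ''})


class ThreadPool(object):

    def __init__(self, thread_count, func, jobs, interval):
        self.work_queue = Queue.Queue()
        self.result_queue = Queue.Queue()
        for job in jobs:
            self.work_queue.put(job)
        self.threads = []
        for _ in xrange(thread_count):
            thread = threading.Thread(target=self._worker,
                                      args=(func, interval))
            thread.start()
            self.threads.append(thread)

    def _worker(self, func, interval):
        while True:
            try:
                job = self.work_queue.get(block=False)
            except Queue.Empty:
                break
            self.result_queue.put(func(job))
            if interval:
                time.sleep(interval)

    def wait_allcomplete(self):
        # block until every worker thread has drained the work queue
        for thread in self.threads:
            thread.join()

    def get_result(self):
        # raises Queue.Empty once all results have been consumed,
        # which is what terminates the while-loops in the examples
        return self.result_queue.get(block=False)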
Example #2
    def begin_crawl(self, conf):
        """爬虫开始的地方。。。

        Args:
        conf: 配置文件

        returns: 
        none 
        """
        ret = self.build_params(conf)
        if ret is False:
            logging.info("Initialization failed, exiting....")
            return False

        crawl_depth = int(self.params['max_depth'])
        # read the seed urls
        arr_source_url = []
        with open(self.params['url_list_file']) as fopen:
            for line in fopen:
                arr_source_url.append(line.strip())
        if not arr_source_url:
            logging.info("Seed file is empty; please check the seed file, exiting")
            return False

        thread_count = self.params['thread_count']
        interval = self.params['crawl_interval']
        # set the crawl timeout
        socket.setdefaulttimeout(self.params['crawl_timeout'])
        for depth in xrange(crawl_depth):
            # crawl the current depth, then parse the pages and extract urls
            work_manager = ThreadPool(thread_count, self.crawl_html,
                                      arr_source_url, interval)
            work_manager.wait_allcomplete()
            while True:
                try:
                    logging.info("开始解析网页")
                    res = work_manager.get_result()
                    # crawler threads store each url and its page content as a JSON string
                    res = json.loads(res)
                    origin_url, html = res['url'], res['html']
                    parse_url_set = Crawler.parse_html(
                        origin_url, html, self.params['target_url'])
                    next_url = parse_url_set[0]
                    js_url = parse_url_set[1]
                    ret = parse_url_set[2]

                    # parse target urls and next-level unvisited urls out of the js files
                    for url in js_url:
                        resp = self.crawl_html(url)
                        js_parse_url_set = Crawler.parse_html(
                            url, resp, self.params['target_url'])
                        tmp_next_url = js_parse_url_set[0]
                        tmp_ret = js_parse_url_set[2]
                        next_url.extend(tmp_next_url)
                        ret.extend(tmp_ret)

                    arr_target_url = Crawler.filter_unvisited_url(
                        ret, self.target_visited_url)

                    for t_url in arr_target_url:
                        self.save_file(t_url)
                        if self.is_enough():
                            return True

                    arr_source_url = Crawler.filter_unvisited_url(
                        next_url, self.visited_url)
                except Queue.Empty as e:
                    logging.warning(e)
                    logging.info("第 %d 级抓取完毕..." % depth)
                    break

        return self.is_enough()
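
For context, here is a hedged sketch of how begin_crawl might be driven. The Crawler constructor and the config file name are assumptions; the parameter names in the comment are exactly the keys begin_crawl reads above.

# -*- coding: utf-8 -*-
# Hypothetical driver for begin_crawl; Crawler() and 'spider.conf' are
# assumptions, while the parameter names are the keys the method reads above.
import logging

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    crawler = Crawler()              # assumed no-argument constructor
    # build_params(conf) is expected to populate self.params with at least:
    #   url_list_file, max_depth, thread_count, crawl_interval,
    #   crawl_timeout and target_url
    if crawler.begin_crawl('spider.conf'):
        logging.info("collected enough target urls")
    else:
        logging.info("crawl finished before reaching the target count")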