Example #1
import threading
import time
import logging
from collections import deque
from Queue import Empty   # Python 2 stdlib; ThreadPool, MyLogger and Db come from the project's own modules


class Crawler(object):
    def __init__(self, myconfig):
        # thread pool sized by the configured thread count
        self.thread_pool = ThreadPool(myconfig.threadnum)
        # set of URLs that have already been visited
        self.visited_urls = set()
        # set is not thread-safe, so guard it with a lock
        self.visited_urls_lock = threading.Lock()
        # queue of URLs that still need to be visited
        self.will_visited_urls = deque()
        self.will_visited_urls.append(myconfig.url)
        self.temp_q = deque()
        self.cur_depth = 0
        self.status = ""
        self.myconfig = myconfig
        MyLogger(myconfig.logfile, myconfig.loglevel)
        #MyLogger(myconfig.logfile, loglevel = 5)  # debug
        self.db = Db()
    
    def start(self):
        self.status = "start"
        while self.cur_depth < self.myconfig.depth:
            if self.status == "stop":
                break
            try:
                while self.will_visited_urls:
                    url = self.will_visited_urls.popleft()
                    # Queue the job. This hardly blocks because it runs in the
                    # main thread and only enqueues work; the actual crawling is
                    # done by the worker threads.
                    self.thread_pool.add_job(self.handler, url)
                #
                # TODO:
                # Notify the threads that there is work to do. The workers are only
                # notified after every URL in will_visited_urls has been queued,
                # which seems rather coarse-grained. To save time, the pool could be
                # notified as soon as the number of queued URLs reaches the initial
                # thread count, and only wait until everything is queued when there
                # are fewer URLs than threads.

                #print ">>>>>>>>  give event to threads in thread pool"
                # Tell the threads in the pool to start a new round of crawling.
                self.thread_pool.event_do_job()
                # Give up the scheduler so the worker threads get time to run.
                time.sleep(3)
            except Empty:
                # No URLs left to visit right now.
                logging.info("no url right now")
            finally:

                # The crawl at this depth only counts as finished once every thread
                # in the pool has finished its work. Poll the pool: while any worker
                # is still busy, sleep and check again, and stop only when no worker
                # thread is running any more. That marks the end of this depth.
                while True:
                    #print "thread waiting num is %d, config thread num is %d" % (self.thread_pool.get_thread_waiting_num(), self.myconfig.thread)
                    if self.thread_pool.get_thread_waiting_num() == self.myconfig.threadnum:
                        # The number of waiting threads equals the initial thread
                        # count, so every worker has finished; break out.
                        break
                    else:
                        # Some threads are still running, so this depth is not
                        # finished yet; sleep and wait.
                        time.sleep(10)
                # This depth is done; go one level deeper.
                self.cur_depth += 1
                logging.info("crawler depth now is %s" % str(self.cur_depth))
                if self.cur_depth > self.myconfig.depth:
                    break
                # Pages discovered at this depth were collected in temp_q;
                # hand them back to will_visited_urls and keep going.
                self.will_visited_urls = self.temp_q
                self.temp_q = deque()
                
                
        # Every depth has been crawled, or the crawler was asked to exit.
        self.thread_pool.stop_threads()
        logging.info("crawler exit")
        return
        
            
    def handler(self, url):
        content = self.get_html_content(url)
        if content == "" or content is None:
            # Could not fetch the content, give up on this URL.
            return
        # Mark this URL as visited.
        self.add_url_to_visited(url)
        if content.find(self.myconfig.key) != -1:
            self.db.save_data(url, self.myconfig.key, content)
        try:
            hrefs = self.get_hrefs(content, url)
        except StandardError as se:
            logging.error("error: %s" % (se))
            print se
            # Could not extract any hrefs.
            return
        # Links were extracted from the page.
        if hrefs:
            # Put them into temp_q; they are visited once this depth is finished.
            for link in hrefs:
                # Final check: skip anything already visited or already queued.
                if not self.is_url_visited(link) \
                            and link not in self.will_visited_urls \
                            and link not in self.temp_q:
                    #print "put %s into temp_q" % link
                    self.temp_q.append(link)
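
A minimal sketch of how this Crawler might be driven, assuming ThreadPool, MyLogger and Db come from the surrounding project. The config field names mirror the attributes the class reads (url, threadnum, depth, key, logfile, loglevel); the class name CrawlerConfig and all values below are hypothetical placeholders.

# Hypothetical driver for the Crawler above (Python 2); values are examples only.
class CrawlerConfig(object):
    def __init__(self):
        self.url = "http://example.com"    # seed URL to start from
        self.threadnum = 4                 # number of worker threads in the pool
        self.depth = 2                     # maximum crawl depth
        self.key = "python"                # keyword to look for in fetched pages
        self.logfile = "spider.log"        # log file handed to MyLogger
        self.loglevel = 3                  # log verbosity handed to MyLogger

if __name__ == "__main__":
    crawler = Crawler(CrawlerConfig())
    crawler.start()    # blocks until every depth has been crawled, then stops the pool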

Example #2

import os
import re
import io
import time
import json
import hashlib
from collections import deque
from urlparse import urljoin, urlparse   # Python 2 stdlib
from bs4 import BeautifulSoup
# ThreadPool, URLFetcher, ResourceDetail, ResourceDetailCollection, mylock and log
# come from the project's own modules.


class Crawler(object):

    def __init__(self, args):
        self.thread_num = args.thread_num
        self.output = args.output
        if not os.path.exists(self.output):
            os.mkdir(self.output)

        self.domain_pattern = re.compile(
            r"^([0-9a-zA-Z][0-9a-zA-Z-]{0,62}\.)+([0-9a-zA-Z][0-9a-zA-Z-]{0,62})\.?$")

    def _init(self):
        # Thread pool with the requested number of threads.
        self.thread_pool = ThreadPool(self.thread_num)
        self.depth = 2
        # Current crawl depth, starting from 1.
        self.current_depth = 1
        # Links that have already been visited.
        self.visited_hrefs = set()
        # Links that still have to be visited.
        self.unvisited_hrefs = deque()
        # Flags whether the crawler is currently running.
        self.is_crawling = False
        self.resource_details = ResourceDetailCollection()

    def _format_url(self, raw_value):
        raw_value_str = raw_value.strip().strip('\n')
        if len(raw_value_str) <= 0:
            return ''
        if not self.domain_pattern.match(raw_value_str):
            return ''
        if not raw_value_str.startswith('http'):
            value = 'http://' + raw_value_str
        else:
            value = raw_value_str
        return value

    def crawl(self, url):
        self._init()
        formatted_url = self._format_url(url)
        self.resource_details.set_main_frame_url(formatted_url)
        self.unvisited_hrefs.append(formatted_url)
        print '\nStart Crawling url %s\n' % formatted_url
        self.is_crawling = True
        self.thread_pool.start_threads()
        while self.current_depth < self.depth + 1:
            # Dispatch the tasks: the thread pool downloads every page at the
            # current depth concurrently (this call does not block).
            self._assigin_current_depth_tasks()
            # Wait for the pool to finish all of its tasks; once the pool is empty,
            # a whole depth level has been crawled.
            # self.thread_pool.task_join() would do the same, but then the loop
            # could not be interrupted with Ctrl-C.
            while self.thread_pool.get_task_left():
                time.sleep(8)
            print 'Depth %d Finish. Totally visited %d links. \n' % (
                self.current_depth, len(self.visited_hrefs))
            log.info('Depth %d Finish. Total visited Links: %d\n' % (
                self.current_depth, len(self.visited_hrefs)))
            self.current_depth += 1
        # After finishing all the tasks, stop this crawling.
        print "all Tasks has finished"
        self._on_all_tasks_finished()
        self.stop()

    def stop(self):
        self.is_crawling = False
        self.thread_pool.stop_threads()

    def get_already_visited_num(self):
        # visited_hrefs holds the links that have already been handed to the task
        # queue, and some of them may still be in flight. The real number of
        # visited links is therefore len(visited_hrefs) minus the tasks still left.
        return len(self.visited_hrefs) - self.thread_pool.get_task_left()

    def _on_all_tasks_finished(self):
        resource_detail_data = unicode(json.dumps(
            self.resource_details.to_json_data(), indent=4))
        hashed_file_name = hashlib.new("md5",
                                       self.resource_details.main_frame_url).hexdigest() + ".json"
        resource_detail_data_path = os.path.join(self.output, hashed_file_name)
        with io.open(resource_detail_data_path, 'w') as output_file:
            output_file.write(unicode(resource_detail_data))

    def _assigin_current_depth_tasks(self):
        mylock.acquire()
        copied_unvisited_hrefs = deque()
        while self.unvisited_hrefs:
            copied_unvisited_hrefs.append(self.unvisited_hrefs.popleft())
        mylock.release()
        while copied_unvisited_hrefs:
            url = copied_unvisited_hrefs.popleft()
            # Mark the link as visited (or about to be visited) so the same
            # link is never crawled twice.
            self.visited_hrefs.add(url)
            # Hand the task to the thread pool's task queue.
            self.thread_pool.put_task(self._task_handler, url)

    def _task_handler(self, url):
        # Fetch the page source first, then save the results; both operations
        # block heavily, so they are delegated to a worker thread.
        url_fetcher = URLFetcher(url)
        retry = 1
        if url_fetcher.fetch(retry):
            self._save_task_results(url, url_fetcher)
            self._add_unvisited_hrefs(url_fetcher)

    def _save_task_results(self, url, url_fetcher):
        print 'Visited URL : %s \n' % url
        response_headers = url_fetcher.get_response_headers()
        response_detail = ResourceDetail(url,
                                         url_fetcher.request_time,
                                         url_fetcher.response_time,
                                         response_headers)
        mylock.acquire()
        self.resource_details.add_detail(response_detail)
        mylock.release()

    def _add_unvisited_hrefs(self, url_fetcher):
        '''Add unvisited links: put every valid URL into unvisited_hrefs.'''
        # Filter the links: 1. keep only http/https URLs; 2. make sure every
        # link is visited at most once.
        url, page_source = url_fetcher.get_data()
        hrefs = self.get_all_resource_hrefs(url, page_source)
        mylock.acquire()
        for href in hrefs:
            if self._is_http_or_https_protocol(href):
                if not self._is_href_repeated(href):
                    self.unvisited_hrefs.append(href)
        mylock.release()

    def get_all_resource_hrefs(self, url, page_source):
        '''Parse the HTML source and collect the page's resource links. Returns a list of links.'''
        hrefs = []
        soup = BeautifulSoup(page_source)
        results = soup.find_all(True)

        for tag in results:
            href = None
            if tag.name == 'a':
                # <a> tags are skipped here; only the href/src attributes of the
                # remaining tags (the page's sub-resources) are collected.
                continue
            # The link has to be encoded as UTF-8: a link to a Chinese file name such
            # as http://aa.com/文件.pdf is not URL-encoded automatically by bs4,
            # which would otherwise lead to an encoding exception.
            if tag.has_attr('href'):
                href = tag.get('href').encode('utf8')
            elif tag.has_attr('src'):
                href = tag.get('src').encode('utf8')
            if href is not None:
                if not href.startswith('http'):
                    href = urljoin(url, href)  # resolve relative links
                hrefs.append(href)
        return hrefs

    def _is_http_or_https_protocol(self, href):
        protocol = urlparse(href).scheme
        if protocol == 'http' or protocol == 'https':
            return True
        return False

    def _is_href_repeated(self, href):
        if href in self.visited_hrefs or href in self.unvisited_hrefs:
            return True
        return False
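
A minimal sketch of how this second Crawler might be invoked from the command line. The argparse flags below are hypothetical and only mirror the two attributes the constructor reads (thread_num and output); note that _format_url expects a bare domain, since its regex does not accept a scheme or path.

# Hypothetical entry point for the second Crawler (Python 2).
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Crawl a page and dump its resource details")
    parser.add_argument("--thread_num", type=int, default=4, help="size of the thread pool")
    parser.add_argument("--output", default="output", help="directory for the JSON reports")
    parser.add_argument("url", help="domain to crawl, e.g. example.com")
    args = parser.parse_args()

    crawler = Crawler(args)
    crawler.crawl(args.url)    # crawls two depth levels, then writes <md5 of url>.json into --output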