Example #1
File: proxy.py Project: flyninjia/python
    def saveProxies(self):
        # Create a pool of 30 threads and start them
        threadPool = ThreadPool(30)
        threadPool.startThreads()

        # Read the proxy records from the database
        #databases = database.DatabaseProxyIp()

        proxyip = self.proxyip_db.readData()
        # Loop over the records and queue a check task for each proxy
        for proxy in proxyip:
            # put the check function into the thread pool
            threadPool.putTask(self.checkclientUrl, proxy[0])
            #threadPool.putTask(self.checkProxy, proxy[0])
            #flag,proxy = checkProxy(proxy[0])
        # Collect the test results: write successes to the database, set available to 0 on failure
        ip_fail = 0
        ip_ok = 0
        ip_lock = 0
        while threadPool.getTaskLeft():
            flag, proxy = threadPool.getTaskResult()
            print flag, proxy
            if flag == 'ok':
                #print 'ok ', proxy
                self.proxyip_db.updateData(1, proxy)
                ip_ok = ip_ok + 1
            elif flag == 'lock':
                self.proxyip_db.updateData(0, proxy)
                ip_lock = ip_lock + 1
            else:
                self.proxyip_db.delData(proxy)
                ip_fail = ip_fail + 1

        print '====> available ip: ', ip_ok, ' , lock ip: ', ip_lock, ' , fail ip: ', ip_fail, ' <===='
        threadPool.stopThreads()
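Every example on this page drives the same ThreadPool helper, which is not itself shown. The following is only a rough sketch of the interface the snippets assume (startThreads, putTask, getTaskLeft, getTaskResult, getAllTaskCount, stopThreads); the pools used by the original projects may be implemented differently.

import threading
from Queue import Queue, Empty

class ThreadPool(object):
    '''Rough sketch of the pool interface assumed by the examples on this page.'''

    def __init__(self, threadNum):
        self.threadNum = threadNum
        self.taskQueue = Queue()    # (func, args) tuples waiting to run
        self.resultQueue = Queue()  # values returned by finished tasks
        self.running = 0            # tasks currently executing
        self.alive = False
        self.pool = []

    def startThreads(self):
        self.alive = True
        for _ in range(self.threadNum):
            thread = threading.Thread(target=self._worker)
            thread.setDaemon(True)
            thread.start()
            self.pool.append(thread)

    def _worker(self):
        while self.alive:
            try:
                func, args = self.taskQueue.get(timeout=1)
            except Empty:
                continue
            self.running += 1
            try:
                result = func(*args)
            except Exception:
                result = None
            if result is not None:
                self.resultQueue.put(result)
            self.running -= 1
            self.taskQueue.task_done()

    def putTask(self, func, *args):
        self.taskQueue.put((func, args))

    def getTaskLeft(self):
        # tasks still queued or running, plus results not yet consumed
        return self.taskQueue.qsize() + self.resultQueue.qsize() + self.running

    def getAllTaskCount(self):
        return self.taskQueue.qsize() + self.running

    def getTaskResult(self):
        return self.resultQueue.get()

    def stopThreads(self):
        self.alive = False
        for thread in self.pool:
            thread.join()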
Example #2
class Crawler(object):
    
    def __init__(self,threadnum,pathname,limit):
        '''limit sets the number of images to fetch; pathname sets the directory to save them in'''
        super(Crawler, self).__init__()
        self.threadPool = ThreadPool(threadnum)
        self.file = PicFile('imgfile','r')
        self.urlqueue = deque()
        self.count = 1
        self._makePath(pathname)
        self.savaPath = os.getcwd()+'/'+pathname
        self._getUrl(limit)

    '''Create the given directory under the current working directory'''
    def _makePath(self,pathname):
        if not os.path.isdir(os.getcwd()+'/'+pathname):
            os.mkdir(os.getcwd()+'/'+pathname)
        else:
            pass

    '''Load URLs from the file into the deque'''
    def _getUrl(self,num):
        while len(self.urlqueue) < num:
            self.urlqueue.append(self.file.getData().rstrip('\n'))
        self.file.close()
        
    def start(self):
        print '---start downloading picture---'
        self.threadPool.startThreads()
        while self.urlqueue:
            self.threadPool.putTask(self._handleTask,self.urlqueue.popleft())
        self.stop()

    def stop(self):
        self.threadPool.stopThreads()
        print '---end downloading picture---'

    '''Task handler'''
    def _handleTask(self,url):
        self._download(url)
    
    '''Download an image and name it with an ascending counter'''
    def _download(self,url,retry=2):
        try:
            r = requests.get(url)
            with open(self.savaPath +'/'+str(self.count)+'.jpg','wb') as jpg:
                jpg.write(r.content)
                self.count+=1
            print url
        except Exception,e:
            # pass the retry budget down so the recursion actually stops after two attempts
            if retry > 0:
                self._download(url,retry-1)
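The Crawler above (and the Fetch class in example #10) reads and writes image URLs through a PicFile helper that is not shown. A plausible minimal version, assuming it is just a thin line-oriented wrapper around a text file, could look like this:

class PicFile(object):
    '''Sketch of the line-oriented file helper used by examples #2 and #10.'''

    def __init__(self, filename, mode):
        self.fd = open(filename, mode)

    def getData(self):
        # return the next URL line, including its trailing newline
        return self.fd.readline()

    def saveData(self, data):
        # append one URL per line
        self.fd.write(data + '\n')

    def close(self):
        self.fd.close()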
Example #3
def saveProxies():
    threadPool = ThreadPool(30)
    threadPool.startThreads()

    proxyFileOK = open('proxyOK.txt', 'a')
    proxyFileFail = open('proxyFail.txt', 'a')
    for proxy in proxiex:
        threadPool.putTask(checkProxy, proxy)
    while threadPool.getTaskLeft():
        flag, proxy = threadPool.getTaskResult()
        print flag, proxy
        if flag == 'ok':
            proxyFileOK.write(proxy)
            proxyFileOK.write('\n')
        else:
            proxyFileFail.write(proxy)
            proxyFileFail.write('\n')

    threadPool.stopThreads()
    proxyFileOK.close()
    proxyFileFail.close()
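checkProxy and the proxiex list in examples #3 and #4 come from the surrounding module and are not shown. A hedged sketch of what such a check function might look like; the test URL and timeout are arbitrary choices, not taken from the original project:

import requests

def checkProxy(proxy):
    '''Try one request through the proxy; report ('ok', proxy) on success, ('fail', proxy) otherwise.'''
    proxies = {'http': 'http://' + proxy}
    try:
        r = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=10)
        if r.status_code == 200:
            return 'ok', proxy
    except Exception:
        pass
    return 'fail', proxy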
Example #4
def saveProxies():
    threadPool = ThreadPool(30)
    threadPool.startThreads()

    proxyFileOK = open('proxyOK.txt','a')
    proxyFileFail = open('proxyFail.txt','a')
    for proxy in proxiex:
        threadPool.putTask(checkProxy, proxy)
    while threadPool.getTaskLeft():
        flag, proxy = threadPool.getTaskResult()
        print flag, proxy
        if flag == 'ok':
            proxyFileOK.write(proxy)
            proxyFileOK.write('\n')
        else:
            proxyFileFail.write(proxy)
            proxyFileFail.write('\n')

    threadPool.stopThreads()
    proxyFileOK.close()
    proxyFileFail.close()
Example #5
class Crawler(object):
    def __init__(self, args=Strategy()):
        self.url = args.url
        self.max_depth = args.max_depth  # maximum crawl depth
        self.max_count = args.max_count  # maximum number of pages to crawl
        self.concurrency = args.concurrency  # number of threads
        self.timeout = args.timeout  # timeout
        self.cookies = args.cookies  # cookies
        self.ssl_verify = args.ssl_verify  # ssl
        self.same_host = args.same_host  # only follow links on the same host
        self.same_domain = args.same_domain  # only follow links on the same domain

        self.currentDepth = 1  # current crawl depth, starting at 1
        self.keyword = args.keyword  # keyword, decoded with the console's default encoding

        self.threadPool = ThreadPool(args.concurrency)  # thread pool with the given number of threads

        self.visitedHrefs = set()  # visited links
        self.unvisitedHrefs = deque()  # links waiting to be visited
        self.unvisitedHrefs.append(args.url)  # seed with the first link
        self.isCrawling = False  # whether the crawler is running

        self.file = BASEDIR + '/cache/crawler/' + genFilename(
            self.url) + '.txt'
        # print self.file
        # print 'args.url=\t',args.url

        #################
        # FIXME: the line below is problematic
        self.database = Database(args.dbFile)  # database
        # print 'hehe'

        self.lock = Lock()

    def start(self):
        # print '\nStart Crawling\n'
        if not self._isDatabaseAvaliable():
            # print 'Error: Unable to open database file.\n'
            pass
        else:
            pass
        if True:
            self.isCrawling = True
            self.threadPool.startThreads()
            while self.currentDepth <= self.max_depth and len(
                    self.visitedHrefs) <= self.max_count:
                # Assign tasks: the thread pool downloads all pages of the current depth concurrently (non-blocking)
                self._assignCurrentDepthTasks()
                # Wait until the pool finishes every task; that means one depth level has been fully crawled
                # self.threadPool.taskJoin() could replace the loop below, but then Ctrl-C could not interrupt
                counter = 0
                while self.threadPool.getTaskLeft() and counter < 600:
                    # print '>>taskleft:\t',self.threadPool.getTaskLeft()
                    # print self.threadPool.taskQueue.qsize()
                    # print self.threadPool.resultQueue.qsize()
                    # print self.threadPool.running
                    time.sleep(1)
                    counter += 1
                # self.threadPool.taskJoin()

                # print 'Depth %d Finish. Totally visited %d links. \n' % (
                # 	self.currentDepth, len(self.visitedHrefs))
                # log.info('Depth %d Finish. Total visited Links: %d\n' % (
                # 	self.currentDepth, len(self.visitedHrefs)))
                self.currentDepth += 1
            self.stop()

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        # self.database.close()

    def saveAllHrefsToFile(self, nonehtml=True):
        try:
            cf = CrawlerFile(url=self.url)
            contentlist = []
            hrefs = [i for i in self.visitedHrefs
                     ] + [j for j in self.unvisitedHrefs]
            for href in hrefs:
                if href.endswith('.html') and nonehtml:
                    continue
                contentlist.append(href)
            cf.saveSection('Hrefs', contentlist, coverfile=True)
            # fp = open(self.file,'w')
            # fp.write('[Hrefs]'+os.linesep)
            # hrefs = [i for i in self.visitedHrefs] + [j for j in self.unvisitedHrefs]
            # rethrefs = []
            # print 'Totally ',len(hrefs), ' hrefs'
            # for href in hrefs:
            # 	if href.endswith('.html'):
            # 		continue
            # 	rethrefs.append(href)
            # 	fp.write(href + os.linesep)
            # 	print href
            # print 'Totally ',len(rethrefs), ' aviable hrefs'
            # fp.close()
        except:
            pass

    def _getCrawlerPaths(self, url):
        ''' '''
        try:
            paths = []
            baseulp = urlparse(url)

            cf = CrawlerFile(url=url)
            urls = cf.getSection('Hrefs')
            #print urls

            for eachline in urls:
                eachline = eachline.replace('\r', '')
                eachline = eachline.replace('\n', '')
                #print eachline
                eachulp = urlparse(eachline)
                if baseulp.scheme == eachulp.scheme and baseulp.netloc == eachulp.netloc:
                    fullpath = eachulp.path
                    if fullpath.find('.') == -1 and not fullpath.endswith('/'):
                        fullpath += '/'
                    pos = 0
                    while True:
                        # print 'fullpath=',fullpath
                        pos = fullpath.find('/', pos)
                        if pos == -1:
                            break
                        tmppth = eachulp.scheme + '://' + eachulp.netloc + eachulp.path[:pos]
                        if tmppth.endswith('/'):
                            #tmppth = tmppth[:-1]
                            # advance pos before continuing so a '//' in the path cannot loop forever
                            pos += 1
                            continue
                        if tmppth not in paths:
                            paths.append(tmppth)
                        pos += 1

            return paths
        except Exception, e:
            print 'Exception:\t', e
            return [url]
Example #6
class Crawler(object):
    def __init__(self, args):
        # maximum crawl depth
        self.depth = args.depth
        # current crawl depth, starting at 1
        self.currentDepth = 1
        # keyword, decoded with the console's default encoding
        self.keyword = args.keyword.decode(getdefaultlocale()[1])
        # database
        self.database = Database()
        # thread pool with the given number of threads
        self.threadPool = ThreadPool(args.threadNum)
        # visited links
        self.visitedHrefs = set()
        # links waiting to be visited
        self.unvisitedHrefs = deque()
        # seed the queue with the first link
        self.unvisitedHrefs.append(args.url)
        # whether the crawler is running
        self.isCrawling = False

    def start(self):
        print '\nStart Crawling\n'
        if not self._isDatabaseAvaliable():
            print 'Error: Unable to open database file.\n'
        else:
            self.isCrawling = True
            self.threadPool.startThreads()
            while self.currentDepth < self.depth + 1:
                # Assign tasks: the thread pool downloads all pages of the current depth concurrently (non-blocking)
                self._assignCurrentDepthTasks()
                # Wait until the pool finishes every task; that means one depth level has been fully crawled
                # self.threadPool.taskJoin() could replace the loop below, but then Ctrl-C could not interrupt
                while self.threadPool.getTaskLeft():
                    time.sleep(8)
                print 'Depth %d Finish. Totally visited %d links. \n' % (
                    self.currentDepth, len(self.visitedHrefs))
                log.info('Depth %d Finish. Total visited Links: %d\n' %
                         (self.currentDepth, len(self.visitedHrefs)))
                self.currentDepth += 1
            self.stop()

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        self.database.close()

    def getAlreadyVisitedNum(self):
        # visitedHrefs holds links already handed to the taskQueue; some may still be in flight.
        # The real number of visited links is therefore len(visitedHrefs) minus the tasks still pending.
        return len(self.visitedHrefs) - self.threadPool.getTaskLeft()

    def _assignCurrentDepthTasks(self):
        while self.unvisitedHrefs:
            url = self.unvisitedHrefs.popleft()
            # put the task into the task queue
            self.threadPool.putTask(self._taskHandler, url)
            # mark the link as visited (or about to be visited) to avoid crawling it twice
            self.visitedHrefs.add(url)

    def _taskHandler(self, url):
        # Fetch the page source, then save it; both block heavily, so they run in worker threads
        webPage = WebPage(url)
        if webPage.fetch():
            self._saveTaskResults(webPage)
            self._addUnvisitedHrefs(webPage)

    def _saveTaskResults(self, webPage):
        url, pageSource = webPage.getDatas()
        try:
            if self.keyword:
                # A case-insensitive regex search is (probably?) faster than lower()-ing the text first
                if re.search(self.keyword, pageSource, re.I):
                    log.info(
                        'save data: url=%s, pageSource=%s, keyword=%s \n' %
                        (url, pageSource, self.keyword))
                    self.database.saveData(url, pageSource, self.keyword)
            else:
                self.database.saveData(url, pageSource)
        except Exception, e:
            log.error(' URL: %s ' % url + traceback.format_exc())
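Several of the crawlers on this page (examples #6, #7, #8 and #12 through #16) share a WebPage helper that is not listed. A rough sketch of the interface they assume, built on requests; the original may use urllib2, custom headers, or different retry handling:

import requests

class WebPage(object):
    '''Sketch of the page-fetching helper the crawler examples rely on.'''

    def __init__(self, url):
        self.url = url
        self.pageSource = None

    def fetch(self, retry=2):
        # truthy on success, falsy on failure
        try:
            r = requests.get(self.url, timeout=10)
            if r.status_code == 200:
                self.pageSource = r.text
                return True
        except Exception:
            if retry > 0:
                return self.fetch(retry - 1)
        return False

    def getDatas(self):
        # the (url, page source) pair consumed by _saveTaskResults and friends
        return self.url, self.pageSource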
Example #7
class CommentCrawler(object):
    
    def __init__(self, group_id, topic_id_list, thread_num, base_path, topic_info_path, comment_info_path):
        """
        `group_id` 当前的Group id
        `topic_id_list` 需要抓取的topic id的list
        `thread_num` 开启的线程数目
        `topic_info_path` 存储topic信息的文件
        `comment_info_path` 存储comment信息的文件
        """
        
        # thread pool with the given number of threads
        self.thread_pool = ThreadPool(thread_num)

        # Different topics are saved to different files, so saving can run concurrently
        self.save_thread = ThreadPool(10)
        
        self.topic_info_path = topic_info_path
        self.comment_info_path = comment_info_path
        self.base_path = base_path
        
        # pages already visited: Group id ==> True or False
        self.visited_href = set()
        # topic ids that failed to crawl
        self.failed = set()
        
        
        # extract topic comments for each group in turn
        self.group_id = group_id
        self.topic_id_list = topic_id_list # topics waiting to be crawled

        # results
        # topic ID ==> Topic object
        self.topic_dict = dict()
        # next comment page to process: topic ID ==> 1,2,3...
        self.next_page = dict()
        # set of topic ids that have been fully crawled
        self.finished = set()

        self.is_crawling = False
        
        # maximum number of comments to crawl per topic
        #self.MAX_COMMETS_NUM = 5000
        self.MAX_COMMETS_NUM = float('inf')

        # number of comments per page
        self.COMMENTS_PER_PAGE = 100

    def start(self):
        print '\nStart Crawling comment list for group: ' + self.group_id + '...\n'
        self.is_crawling = True
        self.thread_pool.startThreads()
        self.save_thread.startThreads()
        
        # open the output files
        self.topic_info_file = codecs.open(self.topic_info_path, 'w', 'utf-8')
        self.comment_info_file = codecs.open(self.comment_info_path, 'w', 'utf-8')

        self.topic_id_list = list(set(self.topic_id_list)) # de-duplicate topic ids
        print "Total topics in group %s: %d." % (self.group_id, len(self.topic_id_list))
        
        # queue the initial tasks
        for topic_id in self.topic_id_list:
            url = "http://www.douban.com/group/topic/" + topic_id + "/"
            self.thread_pool.putTask(self._taskHandler, url)
            # later comment pages look like: http://www.douban.com/group/topic/35082953/?start=100
            self.next_page[topic_id] = 1
        
        # The old notion of crawl depth is dropped entirely; tasks are pushed to the thread pool continuously
        while True:
            # keep the number of pending tasks at twice the thread count at all times
            print "Check threadPool queue..."
            while self.thread_pool.getTaskLeft() < self.thread_pool.threadNum * 2:
                # get the next link that needs to be visited
                url = self._getFutureVisit()
                if url is not None: 
                    self.thread_pool.putTask(self._taskHandler, url)
                else: # no more links to visit
                    break
            # check the thread pool queue every two seconds
            time.sleep(2)
            # check whether everything has been processed
            if len(self.finished) == len(self.topic_id_list):
                break
            elif len(self.finished) > len(self.topic_id_list):
                assert(False)
            print 'Total topics: %d, Finished topic: %d' % (len(self.topic_id_list), len(self.finished))
            
            remain = set(self.topic_id_list) - self.finished
            if len(remain) < 5:
                print 'Unfinished: ', remain
                
        # wait for all tasks in the thread pool to finish
        print "Totally visited: ", len(self.visited_href)
        #pdb.set_trace()
        while self.thread_pool.getTaskLeft() > 0:
            print "Task left in threadPool: ", self.thread_pool.getTaskLeft()
            print "Task queue size: ", self.thread_pool.taskQueue.qsize()
            print "Running tasks: ", self.thread_pool.running
            time.sleep(2)
        
        # wait for the save threads to finish
        while self.save_thread.getTaskLeft() > 0:
            print "Task left in save thread: ", self.save_thread.getTaskLeft()
            print "Task queue size: ", self.save_thread.taskQueue.qsize()
            print "Running tasks: ", self.save_thread.running
            time.sleep(2)
        
        # log the topic ids that failed to crawl
        log.info('Topic ids that failed to crawl:')
        s = ''
        for topic_id in self.failed:
            s += (topic_id + '\n')
        log.info('\n' + s)
        
        print "Terminating all threads..."
        self.stop()
        assert(self.thread_pool.getTaskLeft() == 0)
        
        self.topic_info_file.close()
        self.comment_info_file.close()
        
        print "Main Crawling procedure finished!"
        
        print "Start to save result..."
        #self._saveCommentList()
        #self._saveComment2file()
        log.info("Processing done with group: %s" % (self.group_id))

    def stop(self):
        self.is_crawling = False
        self.thread_pool.stopThreads()
        self.save_thread.stopThreads()
        
    def _taskHandler(self, url):
        """ 根据指定的url,抓取网页,并进行相应的访问控制
        """      
        print "Visiting : " + url
        webPage = WebPage(url)
        
        # 抓取页面内容
        flag = webPage.fetch()
        match_obj = RETopic.match(url)
        match_obj2 = REComment.match(url)
        
        if flag:
            if match_obj is not None:
                topic_id = match_obj.group(1)
                topic = Topic(topic_id, self.group_id)
                comment_list = topic.parse(webPage, isFirstPage = True) # First page parsing
                self.topic_dict[topic_id] = topic
                # save into a single file (deprecated)
                #self.save_thread.putTask(self._saveHandler, comment_list, topic = topic)
            elif match_obj2 is not None:
                topic_id = match_obj2.group(1)
                start = int(match_obj2.group(2))
                # extract comment data from pages after the first
                if topic_id in self.topic_dict:
                    topic = self.topic_dict[topic_id]
                    if topic is None:
                        log.error('Unknown error: stop extracting topic id %s and release its memory.' % topic_id)
                        self.topic_dict[topic_id] = None
                        return False
                else:
                    # The first page of comments must be processed first; otherwise topic_id never appears as a key of self.topic_dict
                    log.error('Error: the first page of comments must be extracted first. Topic id: %s' % topic_id)
                    self.failed.add(topic_id)
                    self.finished.add(topic_id)
                    return False
                    
                comment_list = topic.parse(webPage, isFirstPage = False) # non-firstpage parsing
                # save into a single file (deprecated)
                #self.save_thread.putTask(self._saveHandler, comment_list, topic = None)
            else:
                #pdb.set_trace()
                log.info('Malformed topic link: %s in Group: %s.' % (url, self.group_id))
            # Check whether crawling of this topic has finished; if so, release its dict entry.
            # This matters a lot, because memory usage grows quickly as the number of topics increases.
            if topic.isComplete():
                self.save_thread.putTask(self._saveTopicHandler, self.topic_dict, topic_id)
                #self.topic_dict[topic_id] = None        # release resources
                self.finished.add(topic_id)
                log.info('Topic: %s crawling finished.' % topic_id)
                
            self.visited_href.add(url)
            return True
        else:
            # Handle pages that failed to fetch.
            # If any single page of a topic fails, the topic is added to finished.
            if match_obj is not None:
                # even the first page of the topic could not be fetched, so put it on the finished list
                topic_id = match_obj.group(1)
            elif match_obj2 is not None:
                topic_id = match_obj2.group(1)
                start = int(match_obj2.group(2))
            else:
                log.info('Malformed topic link: %s in Group: %s.' % (url, self.group_id))

            # record the failed topic id and mark the topic as finished
            self.failed.add(topic_id)
            self.finished.add(topic_id) # some info for this topic may already have been recorded
            self.visited_href.add(url)
            return False
        
    def _saveTopicHandler(self, topic_dict, topic_id):
        """ 存储抓取完毕的帖子信息以及其对应的Comment。
        不过,跟_saveHandler函数不同的是,这里是按照topic id存储
        @topic_dict 存储topic信息的字典
        @topic_id 需要存储的topic id
        """
        # 对评论进行排序,并查找quote comment
        topic = topic_dict[topic_id]
        topic.sortComment()
        
        topic_path = self.base_path + group_id + '/' + topic_id + '-info.txt'
        # 存储topic本身的信息
        f = codecs.open(topic_path, 'w', 'utf-8')
        s = topic.getSimpleString('[=]')
        f.write(s + '\n')
        #f.write('[*ROWEND*]')
        
        # save the comment info into the same file
        for comment in topic.comment_list:
            s = comment.getSimpleString('[=]')
            #f.write(s + '\n[*ROWEND*]\n')
            f.write(s + '\n')
        f.close()
        
        self.topic_dict[topic_id] = None        # release resources
        log.info("Topic: %s saved." % topic_id)

    def _getFutureVisit(self):
        """根据当前的访问情况,获取下一个要访问的网页
        """
        for topic_id in self.topic_dict:
            if topic_id in self.finished:
                continue
            topic = self.topic_dict[topic_id]
            if topic is None:
                continue
            if topic.max_comment_page <= 0:
                # the first page of this topic has not been processed yet
                continue
            elif topic.max_comment_page == 1:
                # this topic only has comments on its first page
                continue
            else:
                # this topic has multiple comment pages
                next_start = self.next_page[topic_id]
                url = "http://www.douban.com/group/topic/" + topic_id + "/?start=" + str(next_start * self.COMMENTS_PER_PAGE)
                if next_start <= topic.max_comment_page-1:
                    self.next_page[topic_id] = next_start + 1
                    return url
                else:
                    continue
        
        return None        

    def _getAllHrefsFromPage(self, url, pageSource):
        '''Parse the html source and return a list of all links on the page'''
        hrefs = []
        soup = BeautifulSoup(pageSource)
        results = soup.find_all('a',href=True)
        for a in results:
            # The link must be encoded to utf8: links with Chinese characters such as http://aa.com/文件.pdf
            # are not URL-encoded automatically by bs4, which would otherwise cause an encode exception
            href = a.get('href').encode('utf8')
            if not href.startswith('http'):
                href = urljoin(url, href)  # resolve relative links
            hrefs.append(href)
        return hrefs

    def _isHttpOrHttpsProtocol(self, href):
        protocal = urlparse(href).scheme
        if protocal == 'http' or protocal == 'https':
            return True
        return False
        
    def _getAlreadyVisitedNum(self):
        # visitedGroups holds links already handed to the taskQueue; some may still be in flight.
        # The real number of visited links is therefore len(visitedGroups) minus the tasks still pending.
        if len(self.visited_href) == 0:
            return 0
        else:
            return len(self.visited_href) - self.thread_pool.getTaskLeft()
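RETopic and REComment used in _taskHandler above are module-level patterns that are not reproduced on this page. Based on the URLs the class itself builds ("http://www.douban.com/group/topic/<id>/" and "...?start=<n>"), they presumably look roughly like the following hypothetical reconstruction:

import re

# first page of a topic: group(1) is the topic id
RETopic = re.compile(r'^http://www\.douban\.com/group/topic/(\d+)/$')
# later comment pages: group(1) is the topic id, group(2) the start offset
REComment = re.compile(r'^http://www\.douban\.com/group/topic/(\d+)/\?start=(\d+)$')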
    
Example #8
class PostIDCrawler(object):

    def __init__(self, start_url, thread_num, post_list_path, max_post_num = 1000):
        """
        `group_id`          the group id to crawl
        `thread_num`         number of crawling threads
        `post_list_path`   file path for saving the full post id list
        """
        # thread pool with the given number of threads
        self.thread_pool = ThreadPool(thread_num)
        # thread for saving topics
        # NOTE: only one save thread is allowed here, because they all write to the same file
        self.save_thread = ThreadPool(1)
        
        # path for saving group-related info
        self.post_list_path = post_list_path

        # pages already visited: Group id ==> True or False
        self.visited_href = set()
        # group discussion pages waiting to be visited
        self.unvisited_href = deque()
        # links of pages that failed to fetch
        self.failed_href = set()
        
        self.start_url = start_url
        
        # Crawling stops in one of two cases: 1) the maximum number of topics has been reached; 2) all topics have been crawled
        # only the thread-id is saved
        self.post_list = list()
        
        self.is_crawling = False
        
        # maximum number of topics to crawl per group
        self.MAX_POST_NUM = max_post_num
        #self.MAX_POST_NUM = float('inf')
        # maximum number of topics shown per page; apparently not every page shows 25 topics
        #self.MAX_TOPICS_PER_PAGE = 25

    def start(self):
        print '\nStart crawling post id list...\n'
        self.is_crawling = True
        self.thread_pool.startThreads()
        self.save_thread.startThreads()
        
        # open the output file
        self.post_list_file = codecs.open(self.post_list_path, 'w', 'utf-8')
        
        print "Add start url:", self.start_url
        self.unvisited_href.append(self.start_url)
        
        # Assign tasks: the thread pool downloads all pages of the current depth concurrently (non-blocking)
        self._assignInitTask()
        # Wait until the pool finishes every task before moving on to the next group
        # self.thread_pool.taskJoin() could replace the loop below, but then Ctrl-C could not interrupt
        while self.thread_pool.getTaskLeft() > 0:
            #print "Task left: ", self.thread_pool.getTaskLeft()
            # check whether enough thread ids have been collected
            if len(self.post_list) > self.MAX_POST_NUM:
                print u'Reached the maximum number of posts, stopping the crawl.'
                break
            else:
                print u'Posts crawled so far:', len(self.post_list)
                
            time.sleep(3)

        # save the crawled results and wait for the save thread to finish
        while self.save_thread.getTaskLeft() > 0:
            print 'Waiting for saving thread. Tasks left: %d' % self.save_thread.getTaskLeft()
            time.sleep(3)
        
        log.info("Thread ID list crawling done.")
        
        self.stop()
        # There may still be pending tasks at this point, but enough posts have already been crawled
        #assert(self.thread_pool.getTaskLeft() == 0)

        # close the file
        self.post_list_file.close()
        print "Main Crawling procedure finished!"

    def stop(self):
        self.is_crawling = False
        self.thread_pool.stopThreads()
        self.save_thread.stopThreads()

    def _assignInitTask(self):
        """取出一个线程,并为这个线程分配任务,即抓取网页
        """ 
        while len(self.unvisited_href) > 0:
            # take a task from the unvisited list and assign it to a thread
            url = self.unvisited_href.popleft()
            self.thread_pool.putTask(self._taskHandler, url)
            # record the group id as visited
            self.visited_href.add(url)
            
    def _taskHandler(self, url):
        """ 根据指定的url,抓取网页,并进行相应的访问控制
        """
        print "Visiting : " + url
        webPage = WebPage(url)
        # fetch the page content
        flag = webPage.fetch()
        if flag:
            url, pageSource = webPage.getDatas()
            hrefs = self._getAllHrefsFromPage(url, pageSource)
            # collect the valid links
            post_list = []
            next_page_url = None
            for href in hrefs:
                # only links matching the post-link format are processed
                m = regex_post_first.match(href)
                if self._isHttpOrHttpsProtocol(href) and m is not None:
                    post_list.append(m.group('post_id'))

                # look for a "next page" link on the current page
                m = regex_next_page.match(href)
                if m != None and (not m.group() in self.visited_href):
                    url = m.group()
                    print 'Add next page link: ', url
                    self.thread_pool.putTask(self._taskHandler, url)
                    self.visited_href.add(url)
                                
            for post_id in post_list:
                #print "Add thread link: ", thread
                self.post_list.append(post_id)
                
            # save the list of topics crawled so far
            self.save_thread.putTask(self._saveTopicHandler, post_list)
        else:                
            log.error(u"抓取讨论帖列表时,发现网址格式错误。URL: %s" % url)
            # if page reading fails
            self.failed_href.add(url)
            return False
            
    def _saveTopicHandler(self, post_list):
        """ 将每次从页面中抓取的topic id随时保存到文件中
        NOTE: saveThread只有一个,所以这里不会造成访问冲突
        """
        for tid in post_list:
            self.post_list_file.write(tid + '\n')
            
        self.post_list_file.flush()
        os.fsync(self.post_list_file)

    def _getAllHrefsFromPage(self, url, pageSource):
        '''Parse the html source and return a list of all links on the page'''
        hrefs = []
        soup = BeautifulSoup(pageSource)
        results = soup.find_all('a',href=True)
        for a in results:
            # The link must be encoded to utf8: links with Chinese characters such as http://aa.com/文件.pdf
            # are not URL-encoded automatically by bs4, which would otherwise cause an encode exception
            href = a.get('href').encode('utf8')
            if not href.startswith('http'):
                href = urljoin(url, href)  # resolve relative links
            hrefs.append(href)
        return hrefs

    def _isHttpOrHttpsProtocol(self, href):
        protocal = urlparse(href).scheme
        if protocal == 'http' or protocal == 'https':
            return True
        return False
        
    def _getAlreadyVisitedNum(self):
        # visitedGroups holds links already handed to the taskQueue; some may still be in flight.
        # The real number of visited links is therefore len(visitedGroups) minus the tasks still pending.
        if len(self.visited_href) == 0:
            return 0
        else:
            return len(self.visited_href) - self.thread_pool.getTaskLeft()
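A minimal usage sketch for the PostIDCrawler above; the start URL below is a placeholder, and regex_post_first / regex_next_page are module-level patterns of the original project that are not reproduced here:

crawler = PostIDCrawler(start_url='http://www.example.com/group/discussion?start=0',
                        thread_num=5,
                        post_list_path='post_id_list.txt',
                        max_post_num=1000)
crawler.start()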
Example #9
class Crawler(object):
    def __init__(self, args):
        self.depth = args.depth
        self.currentDepth = 1
        self.database = database(args.dbFile)
        self.threadPool = ThreadPool(args.threadNum)
        self.visitUrls = set()
        self.unvisitedUrls = deque()
        self.unvisitedUrls.append(args.url)
        self.isCrawling = False
        self.maxWebPages = args.maxWebPages

    def requestPage(self, url, retry=2):
        try:
            h = self.customerHeader(url)
            content = requests.get(url, headers=h, timeout=10)
            self.handleEncoding(content)
            if content.status_code == requests.codes.ok:
                if 'html' in content.headers['Content-Type']:
                    return content.text
            log.warning('Page not available. Status code:%d URL:%s\n' %
                        (content.status_code, url))
        except Exception:
            if retry > 0:
                return self.requestPage(url, retry - 1)
            else:
                log.debug('request Fail URL:%s' % url)
        return None

    def extractUrls(self, content, url):
        allUrls = self.getAllHrefs(url, content)
        for href in allUrls:
            if self.isHttpProtocol(href):
                if href not in self.visitUrls and href not in self.unvisitedUrls:
                    self.unvisitedUrls.append(href)

    def saveResult(self, content, url):
        self.database.saveWeb(url, content)

    def taskHandler(self, url):
        content = self.requestPage(url)
        self.saveResult(content, url)
        self.extractUrls(content, url)

    def assignTask(self):
        while self.unvisitedUrls:
            url = self.unvisitedUrls.popleft()
            self.threadPool.putTask(self.taskHandler, url)
            self.visitUrls.add(url)

    def start(self):
        print '\n Start Crawling\n'
        if self.database.con:
            self.isCrawling = True
            self.threadPool.startThreads()
            while self.currentDepth < self.depth + 1:
                self.assignTask()
                while self.threadPool.getAllTaskCount():
                    time.sleep(4)
                print 'Depth %d Finish' % self.currentDepth
                print 'Totally crawled %d links' % len(self.visitUrls)
                log.info('Depth %d Finish. Totally crawled %d links' %
                         (self.currentDepth, len(self.visitUrls)))
                if len(self.visitUrls) > self.maxWebPages:
                    break
                self.currentDepth += 1
            self.stop()

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        self.database.close()

    def customerHeader(self, url):
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'gb18030,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding': 'gzip,deflate,sdch',
            'Accept-Language': 'en-US,en;q=0.8',
            'Connection': 'keep-alive',
            'User-Agent':
            'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.79 Safari/537.4',
            'Referer': url,
        }
        return headers

    def getAllHrefs(self, url, content):
        hrefs = []
        s = bs(content)
        res = s.findAll('a', href=True)
        for r in res:
            href = r.get('href').encode('utf8')
            if not href.startswith('http'):
                href = urljoin(url, href)
            hrefs.append(href)
        return hrefs

    def isHttpProtocol(self, href):
        protocal = urlparse(href).scheme
        if protocal == 'http' or protocal == 'https':
            return True
        return False

    def handleEncoding(self, response):
        if response.encoding == 'ISO-8859-1':
            charset_re = re.compile("((^|;)\s*charset\s*=)([^\"']*)", re.M)
            charset = charset_re.search(response.text)
            charset = charset and charset.group(3) or None
            response.encoding = charset
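The database object used above only needs a con attribute plus saveWeb and close. One possible sqlite3-backed sketch; the table name and schema are made up for illustration and are not part of the original project:

import sqlite3

class database(object):
    '''Sketch of the storage helper assumed by examples #9 and #11.'''

    def __init__(self, dbFile):
        # check_same_thread=False because saveWeb is called from worker threads;
        # a real implementation should also serialize writes with a lock
        self.con = sqlite3.connect(dbFile, check_same_thread=False)
        self.con.execute('CREATE TABLE IF NOT EXISTS webpage (url TEXT, content TEXT)')

    def saveWeb(self, url, content):
        with self.con:
            self.con.execute('INSERT INTO webpage VALUES (?, ?)', (url, content))

    def close(self):
        self.con.close()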
Example #10
class Fetch(object):

    def __init__(self,url,threadnum,limit):
        #self.database = Database('pichref.sql')
        self.file = PicFile('imgfile','a')
        self.threadPool = ThreadPool(threadnum)
        self.unaccesshref = deque()  # deque of unvisited links
        self.accessedhref = set()  # set of visited links
        self.unaccesshref.append(url)  # seed with the initial link
        self.limit = limit
        self.picUrlCount = 1
        


    def start(self):
        print '--start downloading url--'
        self.threadPool.startThreads()
        while self.unaccesshref:  # keep assigning tasks while the queue is not empty
            self._organise()
            print '---'

        self.stop()

    def stop(self):
        self.threadPool.stopThreads()
        self.file.close()
        print '--Stop downloading url--'

    # assign tasks to the thread pool
    def _organise(self):
        while self.unaccesshref:
            url = self.unaccesshref.popleft()  # pop a URL from the left of the deque
            #print 'popleft sucess'
            self.threadPool.putTask(self._handle_task,url)  # assign the task
            self.accessedhref.add(url)  # mark it as handled
            time.sleep(2)  # pause briefly so unaccesshref can be refilled in time

        print 'accessedhref',self.accessedhref
        print 'unaccesshref',self.unaccesshref


    # handle a task
    def _handle_task(self,url):
        webpage = DownloadWeb(url)
        if webpage.download():
            self._addUrlToUnaccesshref(webpage)



    # add ordinary links to the unvisited list
    def _addUrlToUnaccesshref(self,webpage):
        url, webpagecontent = webpage.getdata()
#        pic_links, hrefs = self._getLinkFromPage(url,webpagecontent)
        hrefs = self._getLinkFromPage(url,webpagecontent)


        for href in hrefs:
            if not self._isUsedhref(href):
                self.unaccesshref.append(href)
#        print 'self.unaccesshref',len(self.unaccesshref),self.unaccesshref,'\n'

    
    # Parse the source, collect ordinary links and image links, and save valid image links to the file
    def _getLinkFromPage(self,url,source_code):
        pic_links, hrefs = [], []
        soup = BeautifulSoup(source_code)
        href_res = soup.find_all('a',href=True)  # ordinary links
        pic_link_res = soup.find_all(src=re.compile('http://.*?\.jpg'))  # image links
        for h in href_res:
            href = h.get('href').encode('utf-8')
            if not href.startswith('http') and href!='/' and href!='#' and href.find(';')==-1:
                href = urljoin(url, href)
                hrefs.append(href)

        for pic_link in pic_link_res:
            pic_link = pic_link.get('src').encode('utf-8')
            self.file.saveData(pic_link)  # save the image link to the file
            
            self.picUrlCount+=1
            if self.picUrlCount >= self.limit:
#                print self.limit,'limit ------'
#                self.stop()
                ## The pool only stops once self.unaccesshref is empty, so clear it when enough links have been collected
                self.unaccesshref=deque()
                return []
        return hrefs

    '''Check whether the url has already been seen'''
    def _isUsedhref(self,href):
        if href in self.accessedhref or href in self.unaccesshref:
            return True
        else:
            return False
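DownloadWeb is another helper that is not shown; Fetch only needs download() to report success and getdata() to return the (url, page source) pair. A hedged requests-based sketch:

import requests

class DownloadWeb(object):
    '''Sketch of the downloader assumed by the Fetch class above.'''

    def __init__(self, url):
        self.url = url
        self.content = None

    def download(self):
        # truthy on success, falsy on failure
        try:
            r = requests.get(self.url, timeout=10)
            if r.status_code == 200:
                self.content = r.text
                return True
        except Exception:
            pass
        return False

    def getdata(self):
        return self.url, self.content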
Example #11
class Crawler(object):

    def __init__(self, args):
        self.depth = args.depth
        self.currentDepth = 1
        self.database = database(args.dbFile)
        self.threadPool = ThreadPool(args.threadNum)
        self.visitUrls = set()
        self.unvisitedUrls = deque()
        self.unvisitedUrls.append(args.url)
        self.isCrawling = False
        self.maxWebPages = args.maxWebPages

    def requestPage(self, url, retry=2):
        try:
            h = self.customerHeader(url)
            content = requests.get(url, headers=h, timeout=10)
            self.handleEncoding(content)
            if content.status_code == requests.codes.ok:
                if 'html' in content.headers['Content-Type']:
                    return content.text
            log.warning('Page not available. Status code:%d URL:%s\n' % (content.status_code, url))
        except Exception:
            if retry > 0:
                return self.requestPage(url, retry-1)
            else:
                log.debug('request Fail URL:%s' % url)
        return None

    def extractUrls(self, content, url):
        allUrls = self.getAllHrefs(url, content)
        for href in allUrls:
            if self.isHttpProtocol(href):
                if href not in self.visitUrls and href not in self.unvisitedUrls:
                    self.unvisitedUrls.append(href)

    def saveResult(self, content, url):
        self.database.saveWeb(url, content)

    def taskHandler(self, url):
        content = self.requestPage(url)
        self.saveResult(content, url)
        self.extractUrls(content, url)

    def assignTask(self):
        while self.unvisitedUrls:
            url = self.unvisitedUrls.popleft()
            self.threadPool.putTask(self.taskHandler, url)
            self.visitUrls.add(url)

    def start(self):
        print '\n Start Crawling\n'
        if self.database.con:
            self.isCrawling = True
            self.threadPool.startThreads()
            while self.currentDepth < self.depth+1:
                self.assignTask()
                while self.threadPool.getAllTaskCount():
                    time.sleep(4)
                print 'Depth %d Finish' % self.currentDepth
                print 'Totally crawled %d links' % len(self.visitUrls)
                log.info('Depth %d Finish. Totally crawled %d links' % (self.currentDepth, len(self.visitUrls)))
                if len(self.visitUrls) > self.maxWebPages:
                    break
                self.currentDepth += 1
            self.stop()

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        self.database.close()

    def customerHeader(self, url):
        headers = {
            'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Charset' : 'gb18030,utf-8;q=0.7,*;q=0.3',
            'Accept-Encoding' : 'gzip,deflate,sdch',
            'Accept-Language' : 'en-US,en;q=0.8',
            'Connection': 'keep-alive',
            'User-Agent' : 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.79 Safari/537.4',
            'Referer' : url,
        }
        return headers

    def getAllHrefs(self, url, content):
        hrefs = []
        s = bs(content)
        res = s.findAll('a', href=True)
        for r in res:
            href = r.get('href').encode('utf8')
            if not href.startswith('http'):
                href = urljoin(url, href)
            hrefs.append(href)
        return hrefs

    def isHttpProtocol(self, href):
        protocal = urlparse(href).scheme
        if protocal == 'http' or protocal == 'https':
            return True
        return False

    def handleEncoding(self, response):
        if response.encoding == 'ISO-8859-1':
            charset_re = re.compile("((^|;)\s*charset\s*=)([^\"']*)", re.M)
            charset=charset_re.search(response.text)
            charset=charset and charset.group(3) or None
            response.encoding = charset
Example #12
class Crawler(object):

    def __init__(self, args):
        # maximum crawl depth
        self.depth = args.depth
        # current crawl depth, starting at 1
        self.currentDepth = 1
        # keyword, decoded with the console's default encoding
        self.keyword = args.keyword.decode(getdefaultlocale()[1])
        # database
        self.database = Database(args.dbFile)
        # thread pool with the given number of threads
        self.threadPool = ThreadPool(args.threadNum)
        # visited links
        self.visitedHrefs = set()
        # links waiting to be visited
        self.unvisitedHrefs = deque()
        # seed the queue with the first link
        #self.unvisitedHrefs.append(args.url)
        # whether the crawler is running
        self.isCrawling = False

        self.domainPattern = re.compile(r"^([0-9a-zA-Z][0-9a-zA-Z-]{0,62}\.)+([0-9a-zA-Z][0-9a-zA-Z-]{0,62})\.?$")
        self.maxDomainSeeds = args.maxDomainSeeds
        self._initDomainSeedsList(args.domainSeeds)

    def _initDomainSeedsList(self, domainSeedsFile):
        fp = open(domainSeedsFile, 'r+')
        urlList = fp.readlines()
        for url in urlList:
            formattedUrl = self._formatUrl(url)
            if len(formattedUrl) > 0 and len(self.unvisitedHrefs) <= self.maxDomainSeeds:
                self.unvisitedHrefs.append(formattedUrl)
        fp.close()
        print 'We have got %d domain seeds.' % len(self.unvisitedHrefs)

    def _formatUrl(self, rawValue):
        rawValueStr = rawValue.strip().strip('\n')
        if len(rawValueStr) <= 0:
            return '' 
        if not self.domainPattern.match(rawValueStr):
            return ''
        if not rawValueStr.startswith('http'):
            value = 'http://' + rawValueStr
        else:
            value = rawValueStr
        return value

    def start(self):
        print '\nStart Crawling\n'
        if not self._isDatabaseAvaliable():
            print 'Error: Unable to open database file.\n'
        else:
            self.isCrawling = True
            self.threadPool.startThreads() 
            while self.currentDepth < self.depth+1:
                # Assign tasks: the thread pool downloads all pages of the current depth concurrently (non-blocking)
                self._assignCurrentDepthTasks()
                # Wait until the pool finishes every task; that means one depth level has been fully crawled
                # self.threadPool.taskJoin() could replace the loop below, but then Ctrl-C could not interrupt
                while self.threadPool.getTaskLeft():
                    time.sleep(8)
                print 'Depth %d Finish. Totally visited %d links. \n' % (
                    self.currentDepth, len(self.visitedHrefs))
                log.info('Depth %d Finish. Total visited Links: %d\n' % (
                    self.currentDepth, len(self.visitedHrefs)))
                self.currentDepth += 1
            self.stop()

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        self.database.close()

    def getAlreadyVisitedNum(self):
        # visitedHrefs holds links already handed to the taskQueue; some may still be in flight.
        # The real number of visited links is therefore len(visitedHrefs) minus the tasks still pending.
        return len(self.visitedHrefs) - self.threadPool.getTaskLeft()

    def _assignCurrentDepthTasks(self):
        mylock.acquire()
        copiedUnvisitedHrefs = deque()
        while self.unvisitedHrefs:
            copiedUnvisitedHrefs.append(self.unvisitedHrefs.popleft())
        mylock.release()
        while copiedUnvisitedHrefs:
            url = copiedUnvisitedHrefs.popleft()
            # mark the link as visited (or about to be visited) to avoid crawling it twice
            self.visitedHrefs.add(url)
            # put the task into the task queue
            self.threadPool.putTask(self._taskHandler, url)
 
    def _taskHandler(self, url):
        # Fetch the page source, then save it; both block heavily, so they run in worker threads
        webPage = WebPage(url)
        retry = 1
        if webPage.fetch(retry):
            print 'Visited URL : %s ' % url
            self._saveTaskResults(webPage)
            self._addUnvisitedHrefs(webPage)

    def _saveTaskResults(self, webPage):
        url, pageSource = webPage.getDatas()
        soup = BeautifulSoup(pageSource)
        image_tags = soup.find_all('img', src=True)
        log.error('pageSource %s' %pageSource)
        for image_tag in image_tags:
            log.error('image_tag %s' %image_tag.contents)
            image_tag_parent = image_tag.find_parent('a')
            if image_tag_parent is not None:
                targetURL = image_tag_parent.get('href').encode('utf8')
                if not targetURL.startswith('http'):
                    targetURL = urljoin(url, targetURL)  # resolve relative links
                adsURL = image_tag.get('src').encode('utf8')
                if not adsURL.startswith('http'):
                    adsURL = urljoin(url, adsURL)  # resolve relative links
                print "We got an ads"
                print "adsURL %s" %adsURL
                print "targetURL %s" %targetURL
                print "referURL %s" %url
                print get_tld(adsURL)
                print get_tld(targetURL)
                print get_tld(url)
                log.error("adsURL %s" %adsURL)
                log.error("targetURL %s" %targetURL)
                log.error("referURL %s" %url)
                # save only when both adsURL and targetURL were actually extracted
                try:
                    self.database.saveData(adsURL, targetURL, url)
                except Exception, e:
                    log.error(' URL: %s ' % url + traceback.format_exc())
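This example additionally relies on two module-level names that do not appear in the snippet: mylock (used in _assignCurrentDepthTasks) and get_tld. Presumably they are set up along these lines, with get_tld coming from the third-party tld package:

from threading import Lock
from tld import get_tld  # third-party package: pip install tld

# shared lock guarding unvisitedHrefs across worker threads
mylock = Lock()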
Example #13
class Crawler(object):

    def __init__(self, args):
        # maximum crawl depth
        self.depth = args.depth
        # current crawl depth, starting at 1
        self.currentDepth = 1
        # keyword, decoded with the console's default encoding
        self.keyword = args.keyword.decode(getdefaultlocale()[1])
        # database
        self.database = Database(args.dbFile)
        # thread pool with the given number of threads
        self.threadPool = ThreadPool(args.threadNum)
        # visited links
        self.visitedHrefs = set()
        # links waiting to be visited
        self.unvisitedHrefs = deque()
        # seed the queue with the first link
        self.unvisitedHrefs.append(args.url)
        # whether the crawler is running
        self.isCrawling = False

    def start(self):
        print '\nStart Crawling\n'
        if not self._isDatabaseAvailable():
            print 'Error: Unable to open database file.\n'
        else:
            self.isCrawling = True
            self.threadPool.startThreads()
            while self.currentDepth < self.depth+1:
                # Assign tasks: the thread pool downloads all pages of the current depth concurrently (non-blocking)
                self._assignCurrentDepthTasks()
                # Wait until the pool finishes every task; that means one depth level has been fully crawled
                # self.threadPool.taskJoin() could replace the loop below, but then Ctrl-C could not interrupt
                while self.threadPool.getTaskLeft():
                    time.sleep(8)
                print 'Depth %d Finish. Totally visited %d links. \n' % (
                    self.currentDepth, len(self.visitedHrefs))
                log.info('Depth %d Finish. Total visited Links: %d\n' % (
                    self.currentDepth, len(self.visitedHrefs)))
                self.currentDepth += 1
            self.stop()

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        self.database.close()

    def getAlreadyVisitedNum(self):
        # visitedHrefs holds links already handed to the taskQueue; some may still be in flight.
        # The real number of visited links is therefore len(visitedHrefs) minus the tasks still pending.
        return len(self.visitedHrefs) - self.threadPool.getTaskLeft()

    def _assignCurrentDepthTasks(self):
        while self.unvisitedHrefs:
            url = self.unvisitedHrefs.popleft()
            # put the task into the task queue
            self.threadPool.putTask(self._taskHandler, url)
            # mark the link as visited (or about to be visited) to avoid crawling it twice
            self.visitedHrefs.add(url)

    def _taskHandler(self, url):
        # Fetch the page source, then save it; both block heavily, so they run in worker threads
        webPage = WebPage(url)
        if webPage.fetch():
            self._saveTaskResults(webPage)
            self._addUnvisitedHrefs(webPage)

    def _saveTaskResults(self, webPage):
        url, pageSource = webPage.getDatas()
        try:
            if self.keyword:
                # A case-insensitive regex search is (probably?) faster than lower()-ing the text first
                if re.search(self.keyword, pageSource, re.I):
                    self.database.saveData(url, pageSource, self.keyword)
            else:
                self.database.saveData(url, pageSource)
        except Exception, e:
            log.error(' URL: %s ' % url + traceback.format_exc())
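The args object passed to these Crawler classes (examples #6, #13, #16 and others) is typically an argparse namespace built by the project's entry script, which is not shown. A sketch of the command-line options they read; the option names match the attribute accesses above, while the defaults are illustrative only:

import argparse

parser = argparse.ArgumentParser(description='threaded keyword crawler')
parser.add_argument('--url', required=True, help='seed URL')
parser.add_argument('--depth', type=int, default=2, help='maximum crawl depth')
parser.add_argument('--threadNum', type=int, default=10, help='number of worker threads')
parser.add_argument('--dbFile', default='crawler.db', help='database file')
parser.add_argument('--keyword', default='', help='keyword to look for in fetched pages')
args = parser.parse_args()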
Example #14
class Crawler(object):
    def __init__(self, args):
        # maximum crawl depth
        self.depth = args.depth
        # current crawl depth, starting at 1
        self.currentDepth = 1
        # database
        self.database = Database(args.dbFile)
        # thread pool with the given number of threads
        self.threadPool = ThreadPool(args.threadNum)
        # visited links
        self.visitedHrefs = set()
        # pages waiting to be visited
        self.unvisitedHrefs = deque()
        # first page to visit
        self.url = args.url
        self.unvisitedHrefs.append(args.url)
        # whether the crawler is running
        self.isCrawling = False

    def isDatabaseAvaliable(self):
        if self.database.isConn():
            return True
        return False

    def _saveTaskResults(self, my_web):
        # only keep pages that contain article content
        str = '.*\w{16}\.((html)|(shtml))'
        url, pageSource = my_web.getDatas()
        r = re.search(str, url)
        if r is not None:
            soup = BeautifulSoup(pageSource)
            if soup.h2 is not None:
                title = unicode(soup.h2.string)
            elif soup.p is not None:
                title = unicode(soup.p.string)
            else:
                title = 'no title'
            text = ''
            for i in soup.find_all('p'):
                text += unicode(i.get_text())
            #tmp = trieKmp.gao(title + text)
            t1 = trieKmp.gao(title)
            t2 = trieKmp.gao(text)
            tmp = []
            for i in xrange(len(t1)):
                if t1[i] != '0':
                    tmp.append('9')
                else:
                    tmp.append(t2[i])
            res = ''.join(tmp)
            #print 'res=', res
            # print 'text=', text, 'tmp=', tmp
            # print 'tmp=', tmp
            self.database.saveData(url, title, text[:40], res)
        return 0

    def _getAllHrefsFromPage(self, url, pageSource):
        '''Parse the source with BeautifulSoup and return the valid links'''
        hrefs = []
        soup = BeautifulSoup(pageSource)
        results = soup.find_all('a', href=True)
        for a in results:
            # encode to utf8 to cope with Chinese characters in links
            href = a.get('href').encode('utf8')
            if not href.strip().startswith('http'):  # strip surrounding whitespace first
                href = urljoin(url, href)
            hrefs.append(href)
        return hrefs

    def _isHttpOrHttpsProtocol(self, href):
        '''Only handle http and https links'''
        protocal = urlparse(href).scheme
        if protocal == 'http' or protocal == 'https':
            if not (self.url in href):
                return False
            if '.jpg' in href:
                return False
            return True
        return False

    def _isHrefRepeated(self, href):
        '''Skip pages that have already been seen'''
        if href in self.visitedHrefs or href in self.unvisitedHrefs:
            return True
        return False

    def _addUnvisitedHrefs(self, my_web):
        '''Add links that have not been visited yet'''
        url, pageSource = my_web.getDatas()
        hrefs = self._getAllHrefsFromPage(url, pageSource)
        for href in hrefs:
            if self._isHttpOrHttpsProtocol(href):
                if not self._isHrefRepeated(href):
                    self.unvisitedHrefs.append(href)

    def getAlreadyVisitedNum(self):
        '''Return the number of pages already visited'''
        return len(self.visitedHrefs) - self.threadPool.getTaskLeft()

    def _taskHandler(self, url):
        '''Functions starting with _ are queued for the worker threads to pick up'''
        my_web = WebPage(url)
        #print 'F**k', my_web.fetch()
        if my_web.fetch():
            #print 'has visited %s' % url
            self._saveTaskResults(my_web)
            self._addUnvisitedHrefs(my_web)

    def _assignCurrentDepthTasks(self):
        '''Assign tasks; this operation does not block'''
        while self.unvisitedHrefs:
            url = self.unvisitedHrefs.popleft()
            # hand it to the task queue
            self.threadPool.putTask(self._taskHandler, url)
            self.visitedHrefs.add(url)

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        self.database.close()

    def start(self):
        print '\nstart crawling', self.url
        self.isCrawling = True
        self.threadPool.startThreads()
        while self.currentDepth < self.depth + 1:
            # assign tasks (this does not block)
            self._assignCurrentDepthTasks()
            # wait for this depth level to finish
            #print 'sssssss'
            #self.threadPool.taskJoin()
            while self.threadPool.getTaskLeft():
                #print self.threadPool.taskQueue.qsize()
                time.sleep(8)
            #print 'eeeeee'
            print 'depth %d finished. totally visited %d links.\n' % (
                self.currentDepth, len(self.visitedHrefs))
            log.info('depth %d finished. totally visited %d links.\n' %
                     (self.currentDepth, len(self.visitedHrefs)))
            self.currentDepth += 1
        self.stop()

    def selfTesting(self):
        url = 'http://www.baidu.com'
        print '\nVisiting www.baidu.com using directly'
        my_web = WebPage(url)
        pageSource = my_web.fetch()
        # check network connectivity
        if pageSource == None:
            print 'please check your network'
        elif not self.isDatabaseAvaliable():
            print 'please make sure you have the permission to save data: %s\n' % args.dbFile
        else:
            self._saveTaskResults(my_web)
            print 'save data successfully'
            print 'seems all is ok'
Example #15
class Crawler(threading.Thread):

    def __init__(self, args, queue):
        threading.Thread.__init__(self)
        # maximum crawl depth
        self.depth = args['depth']
        # current crawl depth, starting at 1
        self.currentDepth = 1
        # keyword, decoded with the console's default encoding
        self.keyword = args['keyword'].decode(getdefaultlocale()[1])
        # database
        self.database = Database(db="bt_tornado")
        # thread pool with the given number of threads
        self.threadPool = ThreadPool(args['threadNum'])
        # visited links
        self.visitedHrefs = set()
        # links waiting to be visited
        self.unvisitedHrefs = deque()
        # add the seed links
        for url in args['url']:
            self.unvisitedHrefs.append(url)
        # whether the crawler is running
        self.isCrawling = False
        # allow or deny crawl url
        self.entryFilter = args['entryFilter']
        # allow to output back url
        self.yieldFilter = args['yieldFilter']
        #
        self.callbackFilter = args['callbackFilter']
        #
        self.db = args['db']
        self.collection = args['collection']
        # communication queue
        self.queue = queue

    def run(self):
        print '\nStart Crawling\n'
        if not self._isDatabaseAvaliable():
            print 'Error: Unable to open database file.\n'
        else:
            self.isCrawling = True
            self.threadPool.startThreads()
            while self.currentDepth < self.depth+1:
                # Assign tasks: the thread pool downloads all pages of the current depth concurrently (non-blocking)
                self._assignCurrentDepthTasks()
                # Wait until the pool finishes every task; that means one depth level has been fully crawled
                # self.threadPool.taskJoin() could replace the loop below, but then Ctrl-C could not interrupt
                while self.threadPool.getTaskLeft():
                    time.sleep(8)
                print 'Depth %d Finish. Totally visited %d links. \n' % (
                    self.currentDepth, len(self.visitedHrefs))
                log.info('Depth %d Finish. Total visited Links: %d\n' % (
                    self.currentDepth, len(self.visitedHrefs)))
                self.currentDepth += 1
            self.stop()

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        self.database.close()
        #use queue to communicate between threads
        self.queue.get()
        self.queue.task_done()

    def getAlreadyVisitedNum(self):
        # visitedHrefs holds links already handed to the taskQueue; some may still be in flight.
        # The real number of visited links is therefore len(visitedHrefs) minus the tasks still pending.
        return len(self.visitedHrefs) - self.threadPool.getTaskLeft()

    def _assignCurrentDepthTasks(self):
        while self.unvisitedHrefs:
            url = self.unvisitedHrefs.popleft()
            if not self.__entry_filter(url):
                self.visitedHrefs.add(url)
                continue
            # put the task into the task queue
            self.threadPool.putTask(self._taskHandler, url)
            # mark the link as visited (or about to be visited) to avoid crawling it twice
            self.visitedHrefs.add(url)

    def _callback_filter(self, webPage):
        #parse the web page to do sth
        url , pageSource = webPage.getDatas()
        for tmp  in self.callbackFilter['List']:
            if re.compile(tmp,re.I|re.U).search(url):
                self.callbackFilter['func'](webPage)

    def _taskHandler(self, url):
        # Fetch the page source, then save it; both block heavily, so they run in worker threads
        webPage = WebPage(url)
        tmp = webPage.fetch()
        if tmp:
            self._callback_filter(webPage)
            self._saveTaskResults(webPage)
            self._addUnvisitedHrefs(webPage)

    def _saveTaskResults(self, webPage):
        url, pageSource = webPage.getDatas()
        _id = md5(url).hexdigest()
        try:
            if self.__yield_filter(url):
                query = {"id": _id}
                document = {"id": _id, "url":url, "createTime": datetime.now()}
                self.database.saveData(query=query, collection=self.collection, document=document)
        except Exception, e:
            log.error(' URL: %s ' % url + traceback.format_exc())
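A minimal launch sketch for the threaded Crawler above (not part of the original project): it assumes the same args dict keys read in __init__ and relies on the queue handshake that stop() performs via queue.get()/task_done(); the concrete values are illustrative.

# Hypothetical launcher for the threading.Thread-based Crawler above (sketch only).
from Queue import Queue

def launch_crawler():
    args = {
        'depth': 2, 'keyword': 'python', 'threadNum': 5,
        'url': ['http://example.com/'],
        'entryFilter': None, 'yieldFilter': None,
        'callbackFilter': {'List': [], 'func': lambda page: None},
        'db': 'bt_tornado', 'collection': 'pages',
    }
    queue = Queue()
    queue.put(object())      # token that Crawler.stop() consumes via queue.get()/task_done()
    crawler = Crawler(args, queue)
    crawler.start()          # threading.Thread.start() invokes run()
    crawler.join()           # wait for run() to finish

launch_crawler()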
示例#16
0
class Crawler(object):

    def __init__(self, args):
        self.depth = args.depth  
        self.currentDepth = 1  
        self.keyword = args.keyword.decode(getdefaultlocale()[1]) 
        self.database =  Database(args.dbFile)
        self.threadPool = ThreadPool(args.threadNum)  
        self.visitedHrefs = set()   
        self.unvisitedHrefs = deque()    
        self.unvisitedHrefs.append(args.url) 
        self.isCrawling = False

    def start(self):
        print '\nStart Crawling\n'
        if not self._isDatabaseAvaliable():
            print 'Error: Unable to open database file.\n'
        else:
            self.isCrawling = True
            self.threadPool.startThreads() 
            while self.currentDepth < self.depth+1:
                self._assignCurrentDepthTasks ()
                #self.threadPool.taskJoin() could be used instead, but then Ctrl-C interrupt would not work
                while self.threadPool.getTaskLeft():
                    time.sleep(8)
                print 'Depth %d Finish. Totally visited %d links. \n' % (
                    self.currentDepth, len(self.visitedHrefs))
                log.info('Depth %d Finish. Total visited Links: %d\n' % (
                    self.currentDepth, len(self.visitedHrefs)))
                self.currentDepth += 1
            self.stop()

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        self.database.close()

    def getAlreadyVisitedNum(self):
        #visitedHrefs holds links already handed to the taskQueue; some may still be in flight,
        #so the real number of visited links is len(visitedHrefs) minus the tasks still pending
        return len(self.visitedHrefs) - self.threadPool.getTaskLeft()

    def _assignCurrentDepthTasks(self):
        while self.unvisitedHrefs:
            url = self.unvisitedHrefs.popleft()
            self.threadPool.putTask(self._taskHandler, url)   
            self.visitedHrefs.add(url)  
 
    def _taskHandler(self, url):
        webPage = WebPage(url)
        if webPage.fetch():
            self._saveTaskResults(webPage)
            self._addUnvisitedHrefs(webPage)

    def _saveTaskResults(self, webPage):
        url, pageSource = webPage.getDatas()
        try:
            if self.keyword:
                if re.search(self.keyword, pageSource, re.I):
                    self.database.saveData(url, pageSource, self.keyword) 
            else:
                self.database.saveData(url, pageSource)
        except Exception, e:
            log.error(' URL: %s ' % url + traceback.format_exc())
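The Crawler above only needs an args object exposing depth, keyword, dbFile, threadNum and url; a small argparse driver along those lines might look like the sketch below (the option names and defaults are assumptions, only the attribute names come from the code).

# Hypothetical command-line driver for the Crawler above (sketch only).
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='depth-limited keyword crawler')
    parser.add_argument('--url', required=True)
    parser.add_argument('--depth', type=int, default=2)
    parser.add_argument('--keyword', default='')
    parser.add_argument('--threadNum', type=int, default=10)
    parser.add_argument('--dbFile', default='crawler.db')
    return parser.parse_args()

if __name__ == '__main__':
    crawler = Crawler(parse_args())
    crawler.start()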
class CommentCrawler(object):
    
    def __init__(self, section_id, post_id_list, crawler_thread_num, save_thread_num, post_base_path):
        """
        `section_id` 天涯的板块名称
        `post_id_list` 需要抓取的post id的list
        `thread_num` 开启的线程数目
        post_base_path: 存储抓取结果的基本目录,每个post一个文件,并以该post的ID命名
        """
        # 抓取网页的线程池,指定线程数
        self.thread_pool = ThreadPool(crawler_thread_num)
        # 由于现在是将不同的topic信息保存到不同的文件中,所以可以同时存储
        self.save_thread = ThreadPool(save_thread_num)
        
        # 保存抓取信息的base path
        self.base_path = post_base_path
        
        # 已经访问的页面: Group id ==> True or False
        self.visited_href = set()
        self.visited_post = set() # 已经添加访问的页面的id集合
        self.finished = set() # 已经抓取完毕的topic id集合
        
        # 抓取失败的topic id
        self.failed = set()
        
        # 依次为每个小组抽取topic评论
        self.section_id = section_id
        self.post_id_list = post_id_list # 等待抓取的topic列表
        self.current_post_id_list = list(post_id_list) # 用于逐步向任务列表中加入post id
        
        # 存储结果
        # topic ID ==> Topic对象
        self.post_dict = dict()
        # 存放下一个处理的评论页数: topic ID ==> 1,2,3...
        self.next_page = dict()

        self.is_crawling = False
        
        # 每个topic抓取的最多comments个数
        #self.MAX_COMMETS_NUM = 1000
        self.MAX_COMMETS_NUM = float('inf')

    def start(self):
        print '\nStart Crawling comment list for group: ' + self.section_id + '...\n'
        self.is_crawling = True
        self.thread_pool.startThreads()
        self.save_thread.startThreads()
        
        self.post_id_list = list(set(self.post_id_list)) # 消除重复的topic id
        print u"Total number of post in section %s: %d." % (self.section_id, len(self.post_id_list))
        
        # 先为字典建立所有的key,避免出现“RuntimeError: dictionary changed size during iteration”错误
        for post_id in self.post_id_list:
            self.post_dict[post_id] = None
        
        # 初始化添加一部分post的id到列表
        for i in xrange(self.thread_pool.threadNum * 2):
            # TODO: 这里的URL模式只是针对“天涯杂谈”部分的链接
            if len(self.current_post_id_list) > 0:
                post_id = self.current_post_id_list.pop()
                url = "http://bbs.tianya.cn/post-%s-%s-1.shtml" % (self.section_id, post_id)
                self.thread_pool.putTask(self._taskHandler, url)
        
        # 完全抛弃之前的抽取深度的概念,改为随时向thread pool推送任务
        while True:
            # 保证任何时候thread pool中的任务数最少为线程数的2倍
            print "Check threalPool queue..."
            while self.thread_pool.getTaskLeft() < self.thread_pool.threadNum * 2:
                # 获取未来需要访问的链接
                url = self._getFutureVisit()
                if url is not None: 
                    self.thread_pool.putTask(self._taskHandler, url)
                else: # 已经不存在下一个链接
                    #print 'No future visit url.'
                    break
            # re-check the thread pool queue every two seconds
            time.sleep(2)
            # 检查是否处理完毕
            if len(self.finished) == len(self.post_id_list):
                break
            elif len(self.finished) > len(self.post_id_list):
                assert(False)
                
            print 'Number of task in LIFO queue: ', self.thread_pool.taskQueue.qsize()
            print 'Number of task in save queue: ', self.save_thread.taskQueue.qsize()
            print 'Total posts: %d, Finished topic: %d' % (len(self.post_id_list), len(self.finished))
                
        # 等待线程池中所有的任务都完成
        print "Totally visited: ", len(self.visited_href)
        #pdb.set_trace()
        while self.thread_pool.getTaskLeft() > 0:
            print "Task left in threadPool: ", self.thread_pool.getTaskLeft()
            print "Task queue size: ", self.thread_pool.taskQueue.qsize()
            print "Running tasks: ", self.thread_pool.running
            time.sleep(2)
        
        # 检查保存线程完成情况
        while self.save_thread.getTaskLeft() > 0:
            print "Task left in save thread: ", self.save_thread.getTaskLeft()
            print "Task queue size: ", self.save_thread.taskQueue.qsize()
            print "Running tasks: ", self.save_thread.running
            time.sleep(2)
        
        # 记录抓取失败的topic id
        log.info(u'抓取失败的post id:')
        s = ''
        for post_id in self.failed:
            s += (post_id + '\n')
        log.info('\n' + s)
        
        print "Terminating all threads..."
        self.stop()
        assert(self.thread_pool.getTaskLeft() == 0)
        
        print "Main Crawling procedure finished!"
        log.info("Processing done with tianya section: %s" % (self.section_id))

    def stop(self):
        self.is_crawling = False
        self.thread_pool.stopThreads()
        self.save_thread.stopThreads()
        
    def _taskHandler(self, url):
        """ 根据指定的url,抓取网页,并进行相应的访问控制
        """      
        print "Visiting : " + url
        webPage = WebPage(url)
        
        # 抓取页面内容
        flag = webPage.fetch()
        m = regex_post.match(url)
        if m == None:
            log.info(u'Post链接格式错误:%s in Group: %s.' % (url, self.section_id))
            return True
        else:
            log.info(u'访问:' + url)
            
        comment_page_index = int(m.group('page_index'))
        post_id = m.group('post_id')
        if flag:
            if comment_page_index == 1: # 首页评论
                post = Post(post_id, self.section_id)
                # 解析讨论帖的第一个页:包括原帖内容和评论内容
                comment_list = post.parse(webPage, isFirstPage = True) # First page parsing
                self.post_dict[post_id] = post
                self.next_page[post_id] = 2
                
            elif comment_page_index > 1:
                # 抽取非第一页的评论数据
                if post_id in self.post_dict:
                    post = self.post_dict[post_id]
                else:
                    # 这里的含义为:必须先处理第一页的评论,否则该post_id不会作为self.topic_dict的键出现
                    log.error(u'错误:必须先抽取第一页的评论数据:post id: %s' % post_id)
                    self.failed.add(post_id)
                    self.finished.add(post_id)
                    return False
                
                if post is None:
                    log.error(u'未知程序错误:结束post id为%s的抽取,释放内存。' % post_id)
                    self.post_dict[post_id] = post
                    return False
                    
                comment_list = post.parse(webPage, isFirstPage = False) # non-firstpage parsing
            else:
                log.info(u'Post链接格式错误:%s in Group: %s.' % (url, self.section_id))

            # 判断抓取是否结束,如果结束,则释放dict内存
            # 这个很重要,因为随着topic数量增多,内存会占很多
            if post.is_complete():
                self.save_thread.putTask(self._saveTopicHandler, self.post_dict, post_id)
                self.finished.add(post_id)
                log.info(u'Topic: %s 抓取结束。' % post_id)
                
            self.visited_href.add(url)
            return True
        else:
            # 处理抓取失败的网页集合,只要一个网页抓取失败,则加入到finished
            # 添加抓取失败的post id和标记抓取结束的post
            self.failed.add(post_id)
            self.finished.add(post_id) # 有可能已经记录了一些某些topic的信息
            self.visited_href.add(url)
            return False

    def _getFutureVisit(self):
        """根据当前的访问情况,获取下一个要访问的网页
        """
        # 先检查当前正在抓取的所有帖子,目标是尽快将其抓去完并保存
        for post_id in self.post_dict:
            if post_id in self.finished:
                continue
            post = self.post_dict[post_id]
            
            if post is None:
                continue
                
            if post.total_comment_page <= 0:
                # 还未处理该topic的首页
                continue
            elif post.total_comment_page == 1:
                # 该topic只有首页有评论
                continue
            else:
                # 该topic有多页评论
                next_page_index = self.next_page[post_id]
                if next_page_index > post.total_comment_page:
                    continue
                else:
                    url = "http://bbs.tianya.cn/post-free-%s-%d.shtml" % (post_id, next_page_index)
                    self.next_page[post_id] = next_page_index + 1
                    return url
                
        # 如果当前正在处理的帖子全部已经抓取完毕,则加入新帖子post_id
        if len(self.current_post_id_list) > 0:
            post_id = self.current_post_id_list.pop()
            url = "http://bbs.tianya.cn/post-%s-%s-1.shtml" % (self.section_id, post_id)
            return url
        else:
            return None
    
    def _saveTopicHandler(self, post_dict, post_id):
        """ 存储抓取完毕的帖子信息以及其对应的Comment。
        不过,跟_saveHandler函数不同的是,这里是按照topic id存储
        post_dict 存储topic信息的字典
        post_id 需要存储的post id
        
        NOTE: 因为随时可能被ctrl+C终止,而此时可能有些帖子的内容还没有保存完成。
        """
        #TODO: 添加SIGINT handler
        # before saving, sort the comments and resolve quoted comments
        post = post_dict[post_id]
        post.sort_comment()
        
        post_path = self.base_path + self.section_id + '/' + post_id + '-info.txt'
        # 存储topic本身的信息
        f = codecs.open(post_path, 'w', 'utf-8')
        s = post.get_simple_string('[=]')
        f.write(s + '\n')
        
        # 存储comment信息,存储到相同的文件中
        for comment in post.comment_list:
            s = comment.get_simple_string('[=]')
            f.write(s + '\n')
        f.close()
        
        # 释放资源
        # NOTE: del self.post_dict[post_id]不能达到效果,如果需要根据post_id是否在
        # self.post_dict中来判断是否已经抓取该帖子
        self.post_dict[post_id] = None
        self.next_page[post_id] = None
        
        log.info(u"Topic: %s 存储结束。" % post_id)

    def _getAllHrefsFromPage(self, url, pageSource):
        '''解析html源码,获取页面所有链接。返回链接列表'''
        hrefs = []
        soup = BeautifulSoup(pageSource)
        results = soup.find_all('a',href=True)
        for a in results:
            #必须将链接encode为utf8, 因为中文文件链接如 http://aa.com/文件.pdf 
            #在bs4中不会被自动url编码,从而导致encodeException
            href = a.get('href').encode('utf8')
            if not href.startswith('http'):
                href = urljoin(url, href)#处理相对链接的问题
            hrefs.append(href)
        return hrefs

    def _isHttpOrHttpsProtocol(self, href):
        protocal = urlparse(href).scheme
        if protocal == 'http' or protocal == 'https':
            return True
        return False
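A possible driver for the Tianya CommentCrawler above, assuming the post ids were collected into a plain text file beforehand; the file name, thread counts and directory layout are illustrative, and note that _saveTopicHandler writes into base_path + section_id + '/', so that directory must already exist.

# Hypothetical driver for the Tianya CommentCrawler above (sketch only).
def crawl_tianya_section(section_id, id_file_path, base_path):
    with open(id_file_path) as f:
        post_id_list = [line.strip() for line in f if line.strip()]
    crawler = CommentCrawler(section_id, post_id_list,
                             crawler_thread_num=5, save_thread_num=1,
                             post_base_path=base_path)
    crawler.start()   # blocks until every post id is either finished or failed

crawl_tianya_section('free', 'post_ids.txt', 'data/tianya/')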
示例#18
0
class Crawler(object):

    def __init__(self, args, startURLs):
        #指定网页深度
        self.depth = args.depth  
        #标注初始爬虫深度,从1开始
        self.currentDepth = 1  
        #指定关键词,使用console的默认编码来解码
        #self.keyword = args.keyword.decode(getdefaultlocale()[1]) 
        #数据库
        self.database =  Database(args.dbFile)
        # store group ids to fils, using UTF-8
        self.groupfile = codecs.open("GroupID.txt", "w", "UTF-8")
        #线程池,指定线程数
        self.threadPool = ThreadPool(args.threadNum)  
        #已访问的小组id
        self.visitedGroups = set()   
        #待访问的小组id
        self.unvisitedGroups = deque()
        
        # 所有的Group信息
        self.groupInfo = []
        
        self.lock = Lock() #线程锁

        #标记爬虫是否开始执行任务
        self.isCrawling = False
        # 添加尚未访问的小组首页
        for url in startURLs:
            match_obj = REGroup.match(url)
            print "Add start urls:", url
            assert(match_obj != None)
            self.unvisitedGroups.append(match_obj.group(1))
        
        # 一分钟内允许的最大访问次数
        self.MAX_VISITS_PER_MINUTE = 10
        # 当前周期内已经访问的网页数量
        self.currentPeriodVisits = 0
        # 将一分钟当作一个访问周期,记录当前周期的开始时间
        self.periodStart = time.time() # 使用当前时间初始化

    def start(self):
        print '\nStart Crawling\n'
        if not self._isDatabaseAvaliable():
            print 'Error: Unable to open database file.\n'
        else:
            self.isCrawling = True
            self.threadPool.startThreads() 
            self.periodStart = time.time() # 当前周期开始
            # 按照depth来抓取网页
            while self.currentDepth < self.depth+1:
                #分配任务,线程池并发下载当前深度的所有页面(该操作不阻塞)
                self._assignCurrentDepthTasks ()
                #等待当前线程池完成所有任务,当池内的所有任务完成时,即代表爬完了一个网页深度
                #self.threadPool.taskJoin()可代替以下操作,可无法Ctrl-C Interupt
                while self.threadPool.getTaskLeft() > 0:
                    print "Task left: ", self.threadPool.getTaskLeft()
                    time.sleep(3)
                print 'Depth %d Finish. Totally visited %d links. \n' % (
                    self.currentDepth, len(self.visitedGroups))
                log.info('Depth %d Finish. Total visited Links: %d\n' % (
                    self.currentDepth, len(self.visitedGroups)))
                self.currentDepth += 1
            self.stop()
            assert(self.threadPool.getTaskLeft() == 0)
            print "Main Crawling procedure finished!"

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        # save group ids to file
        for group_id in self.visitedGroups:
            self.groupfile.write(group_id + "\n")
        self.groupfile.close()
        self.database.close()

    def getAlreadyVisitedNum(self):
        #visitedGroups保存已经分配给taskQueue的链接,有可能链接还在处理中。
        #因此真实的已访问链接数为visitedGroups数减去待访问的链接数
        if len(self.visitedGroups) == 0:
            return 0
        else:
            return len(self.visitedGroups) - self.threadPool.getTaskLeft()

    def _assignCurrentDepthTasks(self):
        """取出一个线程,并为这个线程分配任务,即抓取网页,并进行相应的访问控制
        """
        # 判断当前周期内访问的网页数目是否大于最大数目
        if self.currentPeriodVisits > self.MAX_VISITS_PER_MINUTE - 1:
            # 等待所有的网页处理完毕
            while self.threadPool.getTaskLeft() > 0:
                print "Waiting period ends..."
                time.sleep(1)
            timeNow = time.time()
            seconds = timeNow - self.periodStart
            if seconds < 60: # if the minute is not over yet, sleep out the remainder
                time.sleep(int(60 - seconds) + 1)
            self.periodStart = time.time() # 重新设置开始时间
            self.currentPeriodVisits = 0
        # 从未访问的列表中抽出,并为其分配thread
        while len(self.unvisitedGroups) > 0:
            group_id = self.unvisitedGroups.popleft()
            #向任务队列分配任务
            url = "http://www.douban.com/group/" + group_id + "/"
            self.threadPool.putTask(self._taskHandler, url)
            # 添加已经访问过的小组id
            self.visitedGroups.add(group_id)
            
    def _taskHandler(self, url):
        """ 根据指定的url,抓取网页
        """
        print "Visiting : " + url
        webPage = WebPage(url)
        # 抓取页面内容
        flag = webPage.fetch()
        if flag:
            self.lock.acquire() #锁住该变量,保证操作的原子性
            self.currentPeriodVisits += 1
            self.lock.release()
            
            self._saveTaskResults(webPage)
            self._addUnvisitedGroups(webPage)
            return True
            
        # if page reading fails
        return False

    def _saveTaskResults(self, webPage):
        """将小组信息写入数据库
        """
        url, pageSource = webPage.getDatas()
        # 产生一个group对象
        dbgroup = Group(url, pageSource)
        # 写入数据库
        self.database.saveGroupInfo(dbgroup)
        
    def _addUnvisitedGroups(self, webPage):
        '''添加未访问的链接,并过滤掉非小组主页的链接。将有效的url放进UnvisitedGroups列表'''
        #对链接进行过滤:1.只获取http或https网页;2.保证每个链接只访问一次
        url, pageSource = webPage.getDatas()
        hrefs = self._getAllHrefsFromPage(url, pageSource)
        for href in hrefs:
            #print "URLs in page: ", href
            match_obj = REGroup.match(href)
            # 只有满足小组主页链接格式的链接才会被处理
            if self._isHttpOrHttpsProtocol(href) and (match_obj is not None):
                #pdb.set_trace()
                group_id = match_obj.group(1)
                #print "Group link: " + href
                if not self._isGroupRepeated(group_id):
                    # 将小组id放入待访问的小组列表中去
                    print "Add group id:", group_id
                    self.unvisitedGroups.append(group_id)

    def _getAllHrefsFromPage(self, url, pageSource):
        '''解析html源码,获取页面所有链接。返回链接列表'''
        hrefs = []
        soup = BeautifulSoup(pageSource)
        results = soup.find_all('a',href=True)
        for a in results:
            #必须将链接encode为utf8, 因为中文文件链接如 http://aa.com/文件.pdf 
            #在bs4中不会被自动url编码,从而导致encodeException
            href = a.get('href').encode('utf8')
            if not href.startswith('http'):
                href = urljoin(url, href)#处理相对链接的问题
            hrefs.append(href)
        return hrefs

    def _isHttpOrHttpsProtocol(self, href):
        protocal = urlparse(href).scheme
        if protocal == 'http' or protocal == 'https':
            return True
        return False

    def _isGroupRepeated(self, group_id):
        if (group_id in self.visitedGroups) or (group_id in self.unvisitedGroups):
            return True
        return False

    def _isDatabaseAvaliable(self):
        if self.database.isConn():
            return True
        return False

    def selfTesting(self, args):
        url = 'http://www.douban.com/group/insidestory/'
        print '\nVisiting http://www.douban.com/group/insidestory/'
        #network test: make sure the douban group page can be fetched
        pageSource = WebPage(url).fetch()
        if pageSource == None:
            print 'Please check your network and make sure it\'s connected.\n'
        #数据库测试
        elif not self._isDatabaseAvaliable():
            print 'Please make sure you have the permission to save data: %s\n' % args.dbFile
        #保存数据
        else:
            #self._saveTaskResults(url, pageSource)
            print 'Create logfile and database successfully.'
            print 'Already fetched the group page, please check the database connection.'
            print 'Seems No Problem!\n'
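The per-minute throttle inside _assignCurrentDepthTasks above (MAX_VISITS_PER_MINUTE, periodStart, currentPeriodVisits) can be read in isolation; the sketch below restates the same bookkeeping as a small stand-alone helper, purely for illustration.

# Stand-alone restatement of the one-minute visit throttle used above (illustrative only).
import time

class MinuteThrottle(object):
    def __init__(self, max_visits_per_minute=10):
        self.max_visits = max_visits_per_minute
        self.period_start = time.time()
        self.visits = 0

    def wait_if_needed(self):
        # once the quota is used up, sleep out the rest of the minute and reset
        if self.visits >= self.max_visits:
            elapsed = time.time() - self.period_start
            if elapsed < 60:
                time.sleep(60 - elapsed)
            self.period_start = time.time()
            self.visits = 0

    def record_visit(self):
        self.visits += 1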
class TopicCrawler(object):

    def __init__(self, group_id, thread_num, group_info_path, topic_list_path, max_topics_num = 1000):
        """
        `group_id`          待抓取的group id
        `thread_num`         抓取的线程
        `group_info_path`   存储group本身的信息文件路径
        `topic_list_path`   保存所有的topic id list的文件路径
        """
        #线程池,指定线程数
        self.thread_pool = ThreadPool(thread_num)
        # 保存topic的线程
        self.save_thread = ThreadPool(1)

        # 写数据库的线程
        #self.DBThread = ThreadPool(1)
                
        # 保存group相关信息
        self.group_info_path = group_info_path
        self.topic_list_path = topic_list_path
        
        # 已经访问的页面: Group id ==> True or False
        self.visited_href = set()
        #待访问的小组讨论页面
        self.unvisited_href = deque()
        # 访问失败的页面链接
        self.failed_href = set()
        
        self.lock = Lock() #线程锁
        
        self.group_id = group_id
        self.group_info = None # models.Group
        
        # 抓取结束有两种可能:1)抓取到的topic数目已经最大;2)已经将所有的topic全部抓取
        # 只保存topic id
        self.topic_list = list()

        self.is_crawling = False
        
        # self.database =  Database("DoubanGroup.db")
        
        # 每个Group抓取的最大topic个数
        self.MAX_TOPICS_NUM = max_topics_num
        #self.MAX_TOPICS_NUM = float('inf')
        # 每一页中显示的最多的topic数量,似乎每页中不一定显示25个topic
        #self.MAX_TOPICS_PER_PAGE = 25

    def start(self):
        print '\nStart Crawling topic list...\n'
        self.is_crawling = True
        self.thread_pool.startThreads()
        self.save_thread.startThreads()
        
        # 打开需要存储的文件
        self.group_info_file = codecs.open(self.group_info_path, 'w', 'utf-8')
        self.topic_list_file = codecs.open(self.topic_list_path, 'w', 'utf-8')
        
        url = "http://www.douban.com/group/" + group_id + "/"
        print "Add start url:", url
        self.unvisited_href.append(url)
        url = "http://www.douban.com/group/" + group_id + "/discussion?start=0"
        print "Add start urls:", url
        self.unvisited_href.append(url)
        
        #分配任务,线程池并发下载当前深度的所有页面(该操作不阻塞)
        self._assignInitTask()
        #等待当前线程池完成所有任务,当池内的所有任务完成时,才进行下一个小组的抓取
        #self.thread_pool.taskJoin()可代替以下操作,可无法Ctrl-C Interupt
        while self.thread_pool.getTaskLeft() > 0:
            #print "Task left: ", self.thread_pool.getTaskLeft()
            time.sleep(3)

        # 存储抓取的结果并等待存储线程结束
        while self.save_thread.getTaskLeft() > 0:
            print 'Waiting for saving thread. Tasks left: %d' % self.save_thread.getTaskLeft()
            time.sleep(3)
            
        print "Stroring crawling topic list for: " + group_id
        print "Save to files..."
        #self._saveTopicList()
        
        print "Processing done with group: " + group_id
        log.info("Topic list crawling done with group %s.", group_id)
        
        self.stop()
        assert(self.thread_pool.getTaskLeft() == 0)
        
        # 关闭文件
        self.group_info_file.close()
        self.topic_list_file.close()
        
        print "Main Crawling procedure finished!"

    def stop(self):
        self.is_crawling = False
        self.thread_pool.stopThreads()
        self.save_thread.stopThreads()

    def _assignInitTask(self):
        """取出一个线程,并为这个线程分配任务,即抓取网页
        """ 
        while len(self.unvisited_href) > 0:
            # 从未访问的列表中抽出一个任务,并为其分配thread
            url = self.unvisited_href.popleft()
            self.thread_pool.putTask(self._taskHandler, url)
            # 添加已经访问过的小组id
            self.visited_href.add(url)
            
    def _taskHandler(self, url):
        """ 根据指定的url,抓取网页,并进行相应的访问控制
        """
        print "Visiting : " + url
        webPage = WebPage(url)
        # 抓取页面内容
        flag = webPage.fetch()
        if flag:
            url, pageSource = webPage.getDatas()
            # 抽取小组主页的置顶贴
            match_obj = REGroup.match(url)
            if match_obj is not None:
                group_id = match_obj.group(1)
                # 添加置顶贴的topic列表
                self._addStickTopic(webPage)
                return True
            
            # 抽取普通讨论贴
            match_obj = REDiscussion.match(url)
            if match_obj is not None:
                group_id = match_obj.group(1)
                start = int(match_obj.group(2))
                
                self._addTopicLink(webPage, start)
                return True
                
            log.error("抓取小组讨论列表时,发现网址格式错误。Group ID: %s, URL: %s" % (self.group_id, url))
            
        # if page reading fails
        self.failed_href.add(url)
        return False

    def _addStickTopic(self, webPage):
        """ 访问小组首页,添加置顶贴
        """
        #pdb.set_trace()
        
        group = Group(self.group_id)
        group.parse(webPage)
        
        self.group_info = group
        
        self.save_thread.putTask(self._saveGroupHandler, group)
        
    def _addTopicLink(self, webPage, start):
        '''将页面中所有的topic链接放入对应的topic列表,并同时加入
        下一步要访问的页面
        '''
        #对链接进行过滤:1.只获取http或https网页;2.保证每个链接只访问一次
        #pdb.set_trace()
        url, pageSource = webPage.getDatas()
        hrefs = self._getAllHrefsFromPage(url, pageSource)
        # 找到有效的链接
        topic_list = []
        for href in hrefs:
            # 只有满足小组topic链接格式的链接才会被处理
            match_obj = RETopic.match(href)
            if self._isHttpOrHttpsProtocol(href) and match_obj is not None:
                topic_list.append(match_obj.group(1))
            
        for topic in topic_list: 
            #print "Add group id:", self.group_id, "with topic link: ", href
            self.topic_list.append(topic)
        # 存储已经抓取的topic list
        self.save_thread.putTask(self._saveTopicHandler, topic_list)
                
        # 如果是首页,则需要添加所有的将来访问的页面
        if start == 0:
            print "Adding future visis for Group: " + self.group_id
            self._addFutureVisit(pageSource)
            
    def _saveTopicHandler(self, topic_list):
        """ 将每次从页面中抓取的topic id随时保存到文件中
        """
        for tid in topic_list:
            self.topic_list_file.write(tid + '\n')
        self.topic_list_file.flush()
        os.fsync(self.topic_list_file.fileno())
        
    def _saveGroupHandler(self, group):
        """ 保存group的基本信息,比如简介,创建日期等
        `group` models.Group
        """
        #print 'In saving thread'
        # 写入group的基本信息和置顶贴id
        self.group_info_file.write(group.getSimpleString('[=]'))
        self.group_info_file.flush()
        os.fsync(self.group_info_file.fileno())

    def _addFutureVisit(self, pageSource):
        """ 访问讨论列表的首页,并添加所有的将来要访问的链接
        """
        #pdb.set_trace()
        if not isinstance(pageSource, unicode):
            # 默认页面采用UTF-8编码
            page = etree.HTML(pageSource.decode('utf-8'))
        else:
            page = etree.HTML(pageSource)
            
        # 目前的做法基于以下观察:在每个列表页面,paginator部分总会显示总的页数
        # 得到总的页数后,便可以知道将来所有需要访问的页面
        paginator = page.xpath(u"//div[@class='paginator']/a")
        last_page = int(paginator[-1].text.strip())
        for i in range(1, last_page):
            # 控制加入topic列表的数量
            if i * 25 >= self.MAX_TOPICS_NUM:
                break
            url = "http://www.douban.com/group/" + self.group_id + "/discussion?start=" + str(i * 25)
            # 向线程池中添加任务:一次性添加
            self.thread_pool.putTask(self._taskHandler, url)
            # 添加已经访问过的小组id
            self.visited_href.add(url)

    def _getAllHrefsFromPage(self, url, pageSource):
        '''解析html源码,获取页面所有链接。返回链接列表'''
        hrefs = []
        soup = BeautifulSoup(pageSource)
        results = soup.find_all('a',href=True)
        for a in results:
            #必须将链接encode为utf8, 因为中文文件链接如 http://aa.com/文件.pdf 
            #在bs4中不会被自动url编码,从而导致encodeException
            href = a.get('href').encode('utf8')
            if not href.startswith('http'):
                href = urljoin(url, href)#处理相对链接的问题
            hrefs.append(href)
        return hrefs

    def _isHttpOrHttpsProtocol(self, href):
        protocal = urlparse(href).scheme
        if protocal == 'http' or protocal == 'https':
            return True
        return False
        
    def _getAlreadyVisitedNum(self):
        #visitedGroups保存已经分配给taskQueue的链接,有可能链接还在处理中。
        #因此真实的已访问链接数为visitedGroups数减去待访问的链接数
        if len(self.visited_href) == 0:
            return 0
        else:
            return len(self.visited_href) - self.thread_pool.getTaskLeft()
            
    def _saveTopicList(self):
        """将抽取的结果存储在文件中
        Note: 这次是将存储过程放在主线程,将会阻塞抓取过程
        """
        group_id = self.group_id
        this_group = self.group_info
        print "For group %s: number of Stick post: %d, number of regurlar post: %d, total topics is: %d." % \
            (group_id, len(this_group.stick_topic_list), len(self.topic_list), len(this_group.stick_topic_list)+len(self.topic_list))
            
        # 将访问失败的网页存储起来
        log.info('抓取失败的网页:')
        for href in self.failed_href:
            log.info(href)
        
        # 保存Group的本身的信息
        f = open(self.group_info_path, "w")
        f.write(this_group.__repr__())
        f.close()
        
        # 存储Topic相关信息
        f = open(self.topic_list_path, 'w')
        for tid in this_group.stick_topic_list:
            f.write(tid + "\n")
            
        f.write("\n")
        for tid in self.topic_list:
            f.write(tid + "\n")
            
        f.close()
        
        self.topic_list = list()
        self.failed_href = set()
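_addFutureVisit above derives every future discussion-list page from the paginator's last page number, 25 topics per page; the helper below shows the same URL arithmetic on its own (the function name is made up, the URL pattern and the MAX_TOPICS_NUM cap are the ones used in the code).

# Illustrative helper mirroring the pagination arithmetic in _addFutureVisit above.
def discussion_page_urls(group_id, last_page, max_topics_num=1000):
    urls = []
    for i in range(1, last_page):
        if i * 25 >= max_topics_num:   # same cap as MAX_TOPICS_NUM above
            break
        urls.append("http://www.douban.com/group/%s/discussion?start=%d" % (group_id, i * 25))
    return urls

print discussion_page_urls('insidestory', 4)
# -> ['...start=25', '...start=50', '...start=75']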
示例#20
0
class Crawler(object):
	def __init__(self, args=Strategy()):
		self.url = args.url 				
		self.max_depth = args.max_depth  	#指定网页深度
		self.max_count = args.max_count		#爬行最大数量
		self.concurrency = args.concurrency	#线程数
		self.timeout = args.timeout			#超时
		self.cookies = args.cookies 		#cookies
		self.ssl_verify = args.ssl_verify 	#ssl
		self.same_host = args.same_host		#是否只抓取相同host的链接
		self.same_domain = args.same_domain	#是否只抓取相同domain的链接

		self.currentDepth = 1  				#标注初始爬虫深度,从1开始
		self.keyword = args.keyword		 	#指定关键词,使用console的默认编码来解码
		

		self.threadPool = ThreadPool(args.concurrency)  #线程池,指定线程数
		
		self.visitedHrefs = set()   		#已访问的链接
		self.unvisitedHrefs = deque()		#待访问的链接 
		self.unvisitedHrefs.append(args.url)#添加首个待访问的链接
		self.isCrawling = False				#标记爬虫是否开始执行任务

		self.file = BASEDIR + '/cache/crawler/' + genFilename(self.url) + '.txt'
		print self.file
		print 'args.url=\t',args.url

		#################
		#此句有问题
		self.database =  Database(args.dbFile)			#数据库
		# print 'hehe'

		self.lock = Lock()

	def start(self):
		print '\nStart Crawling\n'
		if not self._isDatabaseAvaliable():
			print 'Error: Unable to open database file.\n'
		else:
			pass
		if True:
			self.isCrawling = True
			self.threadPool.startThreads() 
			while self.currentDepth <= self.max_depth and len(self.visitedHrefs) <= self.max_count:
				#分配任务,线程池并发下载当前深度的所有页面(该操作不阻塞)
				self._assignCurrentDepthTasks ()
				#等待当前线程池完成所有任务,当池内的所有任务完成时,即代表爬完了一个网页深度
				#self.threadPool.taskJoin()可代替以下操作,可无法Ctrl-C Interupt
				counter = 0
				while self.threadPool.getTaskLeft() and counter < 600:
					# print '>>taskleft:\t',self.threadPool.getTaskLeft()
					# print self.threadPool.taskQueue.qsize()
					# print self.threadPool.resultQueue.qsize()
					# print self.threadPool.running
					time.sleep(1)
					counter += 1
				# self.threadPool.taskJoin()

				print 'Depth %d Finish. Totally visited %d links. \n' % (
					self.currentDepth, len(self.visitedHrefs))
				log.info('Depth %d Finish. Total visited Links: %d\n' % (
					self.currentDepth, len(self.visitedHrefs)))
				self.currentDepth += 1
			self.stop()

	def stop(self):
		self.isCrawling = False
		self.threadPool.stopThreads()
		# self.database.close()

	def saveAllHrefsToFile(self,nonehtml=True):
		try:
			cf = CrawlerFile(url=self.url)
			contentlist = []
			hrefs = [i for i in self.visitedHrefs] + [j for j in self.unvisitedHrefs]
			for href in hrefs:
				if href.endswith('.html') and nonehtml:
					continue
				contentlist.append(href)
			cf.saveSection('Hrefs',contentlist,coverfile=True)
			# fp = open(self.file,'w')
			# fp.write('[Hrefs]'+os.linesep)
			# hrefs = [i for i in self.visitedHrefs] + [j for j in self.unvisitedHrefs]
			# rethrefs = []
			# print 'Totally ',len(hrefs), ' hrefs'
			# for href in hrefs:
			# 	if href.endswith('.html'):
			# 		continue
			# 	rethrefs.append(href)
			# 	fp.write(href + os.linesep)
			# 	print href
			# print 'Totally ',len(rethrefs), ' aviable hrefs'
			# fp.close()
		except:
			pass

	def _getCrawlerPaths(self,url):
		''' '''
		try:
			paths = []
			baseulp = urlparse(url)

			cf = CrawlerFile(url=url)
			urls = cf.getSection('Hrefs')
			#print urls

			for eachline in urls:
				eachline = eachline.replace('\r','')
				eachline = eachline.replace('\n','')
				#print eachline
				eachulp = urlparse(eachline)
				if baseulp.scheme == eachulp.scheme and baseulp.netloc == eachulp.netloc:
					fullpath = eachulp.path
					if fullpath.find('.') == -1 and fullpath.endswith('/') == False:
						fullpath += '/'
					pos = 0
					while True:
						pos = fullpath.find('/',pos)
						if pos == -1:
							break
						tmppth = eachulp.scheme + '://' + eachulp.netloc + eachulp.path[:pos]
						if tmppth.endswith('/'):
							#tmppth = tmppth[:-1]
							continue
						if tmppth not in paths:
							paths.append(tmppth)
						pos +=1

			return paths
		except Exception,e:
			print 'Exception:\t',e
			return [url]
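_getCrawlerPaths above walks every stored URL on the same scheme and host and collects each directory prefix; the snippet below is a compact, stand-alone restatement of that prefix walk (illustrative only, not a drop-in replacement).

# Illustrative restatement of the directory-prefix extraction in _getCrawlerPaths above.
from urlparse import urlparse

def directory_prefixes(base_url, url):
    base, u = urlparse(base_url), urlparse(url)
    if (base.scheme, base.netloc) != (u.scheme, u.netloc):
        return []
    # append a trailing slash when the path has no file part, as the code above does
    path = u.path if '.' in u.path or u.path.endswith('/') else u.path + '/'
    prefixes, pos = [], 0
    while True:
        pos = path.find('/', pos)
        if pos == -1:
            break
        candidate = u.scheme + '://' + u.netloc + path[:pos]
        if not candidate.endswith('/') and candidate not in prefixes:
            prefixes.append(candidate)
        pos += 1
    return prefixes

print directory_prefixes('http://example.com/', 'http://example.com/a/b/page.html')
# -> ['http://example.com', 'http://example.com/a', 'http://example.com/a/b']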
示例#21
0
class Crawler(object):

    def __init__(self, args):
        #指定网页深度
        self.depth = args.depth
        #表示爬虫深度,从1开始
        self.currentDepth = 1
        #数据库
        self.database = Database(args.dbFile)
        #线程池,指定线程数
        self.threadPool = ThreadPool(args.threadNum)
        #已经访问的链接
        self.visitedHrefs = set()
        #待访问的页面
        self.unvisitedHrefs = deque()
        #首个待访问的页面
        self.url = args.url
        self.unvisitedHrefs.append(args.url)
        #标记爬虫是否开始执行
        self.isCrawling = False

    def isDatabaseAvaliable(self):
        if self.database.isConn():
            return True
        return False

    def _saveTaskResults(self, my_web):
        #只过滤包含正文的网页
        pattern = r'.*\w{16}\.((html)|(shtml))'
        url, pageSource = my_web.getDatas()
        r = re.search(pattern, url)
        if r is not None:        
           soup = BeautifulSoup(pageSource)
           if soup.h2 is not None:
               title = unicode(soup.h2.string)
           elif soup.p is not None:
               title = unicode(soup.p.string)
           else:
               title = 'no title'
           text = ''
           for i in soup.find_all('p'):
              text += unicode(i.get_text())           
           #tmp = trieKmp.gao(title + text)
           t1 = trieKmp.gao(title)
           t2 = trieKmp.gao(text)
           tmp = []
           for i in xrange(len(t1)):
               if t1[i] != '0':
                   tmp.append('9')
               else:
                   tmp.append(t2[i])
           res = ''.join(tmp)
           #print 'res=', res          
          # print 'text=', text, 'tmp=', tmp
          # print 'tmp=', tmp
           self.database.saveData(url, title, text[: 40], res)
        return 0

    def _getAllHrefsFromPage(self, url, pageSource):
        '''用beautifulsoup解析源码,得到有效连接'''
        hrefs = []
        soup = BeautifulSoup(pageSource)
        results = soup.find_all('a', href = True)
        for a in results:
            #防止中文连接,encode转为utf8
            href = a.get('href').encode('utf8')
            if not href.strip().startswith('http'):           #去除前后多余的空格
                href = urljoin(url, href)
            hrefs.append(href)
        return hrefs

    def _isHttpOrHttpsProtocol(self, href):
        '''only handle http/https links'''
        protocal = urlparse(href).scheme
        if protocal == 'http' or protocal == 'https':
            if not(self.url in href):
                return False
            if '.jpg' in href:
                return False
            return True
        return False

    def _isHrefRepeated(self, href):
        '''去掉重复的网页'''
        if href in self.visitedHrefs or href in self.unvisitedHrefs:
            return True
        return False

    def _addUnvisitedHrefs(self, my_web):
        '''添加未访问连接'''
        url, pageSource = my_web.getDatas()
        hrefs = self._getAllHrefsFromPage(url, pageSource)
        for href in hrefs:
            if self._isHttpOrHttpsProtocol(href):
                if not self._isHrefRepeated(href):
                    self.unvisitedHrefs.append(href)

    def getAlreadyVisitedNum(self):
        '''获得已经访问的网页的数目'''
        return len(self.visitedHrefs) - self.threadPool.getTaskLeft()

    def _taskHandler(self, url):
        '''以_开头的函数是放在队列里供线程提取用的'''
        my_web = WebPage(url)
        #print 'F**k', my_web.fetch()
        if my_web.fetch():
            #print 'has visited %s' % url
            self._saveTaskResults(my_web)
            self._addUnvisitedHrefs(my_web)

    def _assignCurrentDepthTasks(self):
        '''分配任务,该操作不阻塞'''
        while self.unvisitedHrefs:
            url = self.unvisitedHrefs.popleft()
            #分配给任务队列
            self.threadPool.putTask(self._taskHandler, url)
            self.visitedHrefs.add(url)

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        self.database.close()

    def start(self):
        print '\nstart crawling', self.url
        self.isCrawling = True
        self.threadPool.startThreads()
        while self.currentDepth < self.depth + 1:
            #分配任务(该操作不阻塞)
            self._assignCurrentDepthTasks()
            #等待该层任务结束
            #print 'sssssss'        
            #self.threadPool.taskJoin()
            while self.threadPool.getTaskLeft():
                #print self.threadPool.taskQueue.qsize()
                time.sleep(8)
            #print 'eeeeee'
            print 'depth %d finished. totally visited %d links.\n' % (self.currentDepth, len(self.visitedHrefs))
            log.info('depth %d finished. totally visited %d links.\n' % (self.currentDepth, len(self.visitedHrefs)))
            self.currentDepth += 1
        self.stop()

    def selfTesting(self, args):
        url = 'http://www.baidu.com'
        print '\nVisiting www.baidu.com using directly'
        my_web = WebPage(url)
        pageSource = my_web.fetch()
        #测试网络链接
        if pageSource == None:
            print 'please check your network'
        elif not self.isDatabaseAvaliable():
            print 'please make sure you have the permission to save data: %s\n' % args.dbFile
        else:
            self._saveTaskResults(my_web)
            print 'save data successfully'
            print 'seems all is ok'
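_saveTaskResults above keeps only article-like pages (a 16-character name ending in .html or .shtml) and pulls a title plus the concatenated <p> text; the extraction step alone can be exercised like this (the sample HTML is made up).

# Illustrative test of the title/body extraction used in _saveTaskResults above.
from bs4 import BeautifulSoup

def extract_title_and_text(pageSource):
    soup = BeautifulSoup(pageSource)
    if soup.h2 is not None:
        title = unicode(soup.h2.string)
    elif soup.p is not None:
        title = unicode(soup.p.string)
    else:
        title = 'no title'
    text = ''.join(unicode(p.get_text()) for p in soup.find_all('p'))
    return title, text

sample = '<html><body><h2>Headline</h2><p>First paragraph.</p><p>Second.</p></body></html>'
print extract_title_and_text(sample)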
示例#22
0
class Crawler(object):

    def __init__(self, args):
        #指定网页深度
        self.depth = args.depth  
        #标注初始爬虫深度,从1开始
        self.currentDepth = 1  
        #指定关键词,使用console的默认编码来解码
        self.keyword = args.keyword.decode(getdefaultlocale()[1]) 
        #数据库
        self.database =  Database(args.dbFile)
        #线程池,指定线程数
        self.threadPool = ThreadPool(args.threadNum)  
        #已访问的链接
        self.visitedHrefs = set()   
        #待访问的链接 
        self.unvisitedHrefs = deque()    
        #添加首个待访问的链接
        self.unvisitedHrefs.append(args.url) 
        #标记爬虫是否开始执行任务
        self.isCrawling = False

    def start(self):
        print ('\nStart Crawling\n')
        if not self._isDatabaseAvaliable():
            print ('Error: Unable to open database file.\n')
        else:
            self.isCrawling = True
            self.threadPool.startThreads() 
            while self.currentDepth < self.depth+1:
                #分配任务,线程池并发下载当前深度的所有页面(该操作不阻塞)
                self._assignCurrentDepthTasks ()
                #等待当前线程池完成所有任务,当池内的所有任务完成时,即代表爬完了一个网页深度
                #self.threadPool.taskJoin()可代替以下操作,可无法Ctrl-C Interupt
                while self.threadPool.getTaskLeft():
                    time.sleep(8)
                print ('Depth %d Finish. Totally visited %d links. \n') % (
                    self.currentDepth, len(self.visitedHrefs))
                log.info('Depth %d Finish. Total visited Links: %d\n' % (
                    self.currentDepth, len(self.visitedHrefs)))
                self.currentDepth += 1
            self.stop()

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        self.database.close()

    def getAlreadyVisitedNum(self):
        #visitedHrefs保存已经分配给taskQueue的链接,有可能链接还在处理中。
        #因此真实的已访问链接数为visitedHrefs数减去待访问的链接数
        return len(self.visitedHrefs) - self.threadPool.getTaskLeft()

    def _assignCurrentDepthTasks(self):
        while self.unvisitedHrefs:
            url = self.unvisitedHrefs.popleft()
            #向任务队列分配任务
            self.threadPool.putTask(self._taskHandler, url) 
            #标注该链接已被访问,或即将被访问,防止重复访问相同链接
            self.visitedHrefs.add(url)  
 
    def _taskHandler(self, url):
        #先拿网页源码,再保存,两个都是高阻塞的操作,交给线程处理
        webPage = WebPage(url)
        if webPage.fetch():
            self._saveTaskResults(webPage)
            self._addUnvisitedHrefs(webPage)

    def _saveTaskResults(self, webPage):
        url, pageSource = webPage.getDatas()
        try:
            if self.keyword:
                #使用正则的不区分大小写search比使用lower()后再查找要高效率(?)
                if re.search(self.keyword, pageSource, re.I):
                    self.database.saveData(url, pageSource, self.keyword) 
            else:
                self.database.saveData(url, pageSource)
        except Exception, e:
            log.error(' URL: %s ' % url + traceback.format_exc())

    def _addUnvisitedHrefs(self, webPage):
        '''添加未访问的链接。将有效的url放进UnvisitedHrefs列表'''
        #对链接进行过滤:1.只获取http或https网页;2.保证每个链接只访问一次
        url, pageSource = webPage.getDatas()
        hrefs = self._getAllHrefsFromPage(url, pageSource)
        for href in hrefs:
            if self._isHttpOrHttpsProtocol(href):
                if not self._isHrefRepeated(href):
                    self.unvisitedHrefs.append(href)

    def _getAllHrefsFromPage(self, url, pageSource):
        '''解析html源码,获取页面所有链接。返回链接列表'''
        hrefs = []
        soup = BeautifulSoup(pageSource)
        results = soup.find_all('a',href=True)
        for a in results:
            #必须将链接encode为utf8, 因为中文文件链接如 http://aa.com/文件.pdf 
            #在bs4中不会被自动url编码,从而导致encodeException
            href = a.get('href').encode('utf8')
            if not href.startswith('http'):
                href = urljoin(url, href)#处理相对链接的问题
            hrefs.append(href)
        return hrefs

    def _isHttpOrHttpsProtocol(self, href):
        protocal = urlparse(href).scheme
        if protocal == 'http' or protocal == 'https':
            return True
        return False

    def _isHrefRepeated(self, href):
        if href in self.visitedHrefs or href in self.unvisitedHrefs:
            return True
        return False

    def _isDatabaseAvaliable(self):
        if self.database.isConn():
            return True
        return False

    def selfTesting(self, args):
        url = 'http://www.baidu.com/'
        print ('\nVisiting www.baidu.com')
        #测试网络,能否顺利获取百度源码
        my_web = WebPage(url)
        pageSource = my_web.fetch()
        if pageSource == None:
            print ('Please check your network and make sure it\'s connected.\n')
        #数据库测试
        elif not self._isDatabaseAvaliable():
            print ('Please make sure you have the permission to save data: %s\n') % args.dbFile
        #保存数据
        else:
            self._saveTaskResults(my_web)
            print ('Create logfile and database Successfully.')
            print ('Already save Baidu.com, Please check the database record.')
            print ('Seems No Problem!\n')
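The keyword test in _saveTaskResults above is just a case-insensitive re.search over the raw page source; in isolation the decision looks like the sketch below (sample strings are illustrative).

# Illustrative check of the case-insensitive keyword filter used in _saveTaskResults above.
import re

def page_matches_keyword(pageSource, keyword):
    if not keyword:
        return True                      # no keyword: every page is saved
    return re.search(keyword, pageSource, re.I) is not None

print page_matches_keyword('<p>Python crawlers are fun</p>', u'python')   # True
print page_matches_keyword('<p>nothing here</p>', u'python')              # False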
示例#23
0
class CommentCrawler(object):
    
    def __init__(self, groupID, topicIDList, threadNum, topic_info_path, comment_info_path):
        """
        `groupID` 当前的Group id
        `topicIDList` 需要抓取的topic id的list
        `threadNum` 开启的线程数目
        `topic_info_path` 存储topic信息的文件
        `comment_info_path` 存储comment信息的文件
        """
        
        #线程池,指定线程数
        self.threadPool = ThreadPool(threadNum)  
        # 写数据库的线程
        #self.DBThread = ThreadPool(1)
        # 保证同时只有一个线程在写文件
        self.saveThread = ThreadPool(1)
        
        self.database =  Database("DoubanGroup.db")
        #self.database =  Database("test.db")
        
        self.topic_info_path = topic_info_path
        self.comment_info_path = comment_info_path
        
        # 已经访问的页面: Group id ==> True or False
        self.visitedHref = set()
        # 抓取失败的topic id
        self.failed = set()
        
        
        # 依次为每个小组抽取topic评论
        self.groupID = groupID
        self.topicIDList = topicIDList # 等待抓取的topic列表
        
        # 存储结果
        # topic ID ==> Topic对象
        self.topicDict = dict()
        # 存放下一个处理的评论页数: topic ID ==> 1,2,3...
        self.nextPage = dict()
        # 已经抓取完毕的topic id集合
        self.finished = set()
        
        
        self.visitedHref = set() # 已经访问的网页

        self.isCrawling = False
        
        # 每个topic抓取的最多comments个数
        #self.MAX_COMMETS_NUM = 5000
        self.MAX_COMMETS_NUM = float('inf')
        
        # 每页的评论数量
        self.COMMENTS_PER_PAGE = 100

    def start(self):
        print '\nStart Crawling comment list for group: ' + self.groupID + '...\n'
        self.isCrawling = True
        self.threadPool.startThreads()
        self.saveThread.startThreads()
        
        # 打开需要存储的文件
        self.topic_info_file = codecs.open(self.topic_info_path, 'w', 'utf-8')
        self.comment_info_file = codecs.open(self.comment_info_path, 'w', 'utf-8')
        
        # 从数据库中读取topic id列表
        #self.topicIDList = self.database.readTopicList(self.groupID)
        self.topicIDList = list(set(self.topicIDList)) # 消除重复的topic id
        print "Total topics in group %s: %d." % (self.groupID, len(self.topicIDList))
        
        # 初始化添加任务
        for topic_id in self.topicIDList:
            url = "http://www.douban.com/group/topic/" + topic_id + "/"
            self.threadPool.putTask(self._taskHandler, url)
            # 下一页评论类似:http://www.douban.com/group/topic/35082953/?start=100
            self.nextPage[topic_id] = 1
        
        # 完全抛弃之前的抽取深度的概念,改为随时向thread pool推送任务
        while True:
            # 保证任何时候thread pool中的任务数为线程数的2倍
            print "Check threalPool queue..."
            while self.threadPool.getTaskLeft() < self.threadPool.threadNum * 2:
                # 获取未来需要访问的链接
                url = self._getFutureVisit()
                if url is not None: 
                    self.threadPool.putTask(self._taskHandler, url)
                else: # 已经不存在下一个链接
                    break
            # 每隔一秒检查thread pool的队列
            time.sleep(2)
            # 检查是否处理完毕
            if len(self.finished) == len(self.topicIDList):
                break
            elif len(self.finished) > len(self.topicIDList):
                assert(False)
            print 'Total topics: %d, Finished topic: %d' % (len(self.topicIDList), len(self.finished))
            
            remain = set(self.topicIDList) - self.finished
            if len(remain) < 5:
                print 'Unfinished: ', remain
                
        # 等待线程池中所有的任务都完成
        print "Totally visited: ", len(self.visitedHref)
        #pdb.set_trace()
        while self.threadPool.getTaskLeft() > 0:
            print "Task left in threadPool: ", self.threadPool.getTaskLeft()
            print "Task queue size: ", self.threadPool.taskQueue.qsize()
            print "Running tasks: ", self.threadPool.running
            time.sleep(2)
        
        while self.saveThread.getTaskLeft() > 0:
            print "Task left in save thread: ", self.saveThread.getTaskLeft()
            print "Task queue size: ", self.saveThread.taskQueue.qsize()
            print "Running tasks: ", self.saveThread.running
            time.sleep(2)
        
        # 记录抓取失败的topic id
        log.info('抓取失败的topic id:')
        s = ''
        for topic_id in self.failed:
            s += (topic_id + '\n')
        log.info('\n' + s)
        
        print "Terminating all threads..."
        self.stop()
        assert(self.threadPool.getTaskLeft() == 0)
        
        self.topic_info_file.close()
        self.comment_info_file.close()
        
        print "Main Crawling procedure finished!"
        
        print "Start to save result..."
        #self._saveCommentList()
        #self.saveComment2file()
        log.info("Processing done with group: %s" % (self.groupID))

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        self.saveThread.stopThreads()

        
    def _saveCommentList(self):
        """将抽取的结果存储在文件中,包括存储topic内容和评论内容
        Note: 这次是将存储过程放在主线程,将会阻塞抓取过程
        NOTE: 此函数已经不再使用
        """
        # 如果不存在目录,则创建它
        path = "data/" + self.groupID + "/"
        if not os.path.exists(path):
            os.mkdir(path)
            
        for topic_id in self.topicIDList:
            topic = self.topicDict[topic_id]
            path = "data/" + self.groupID + "/" + topic_id + ".txt"
            f = codecs.open(path, "w", "utf-8", errors='replace')
            f.write(topic.__repr__())
            f.close()
            
        # save the failed hrefs
        f = open("data/"+self.groupID+"/failed.txt", "w")
        for href in self.failed:
            f.write(href + "\n")
        f.close()
        
        # write comment structures
        path = "structure/" + self.groupID + "/"
        if not os.path.exists(path):
            os.mkdir(path)
        for topic_id in self.topicDict:
            atopic = self.topicDict[topic_id]
            path = "structure/" + self.groupID + "/" + topic_id + ".txt"
            f = codecs.open(path, "w", "utf-8", errors='replace')
            # 每一行:评论id,评论用户id,(引用评论id,引用评论的用户id)
            for comment in atopic.comment_list:
                f.write(comment.cid + " " + comment.user_id)
                if comment.quote is not None:
                    f.write(" " + comment.quote.cid + " " + comment.quote.user_id)
                f.write("\n")
            f.close()
            
    def saveComment2file(self):
        """ 直接将抓取结果存入文件中
        """
        ftopic = open(self.topic_info_path, 'w')
        fcomment = open(self.comment_info_path , 'w')
        for topic_id in self.topicDict:
            topic = self.topicDict[topic_id]
            s = topic.getSimpleString(delimiter = '[=]')
            ftopic.write(s + '\n[*ROWEND*]\n')
            for comment in topic.comment_list:
                cs = comment.getSimpleString(delimiter = '[=]')
                fcomment.write(cs + '\n[*ROWEND*]\n')
                
        ftopic.close()
        fcomment.close()
    
    def _save_handler(self, comment_list, topic):
        """ 将topic信息和comemnt信息保存到文件中
        """
        # 先保存comment_list id
        # 判断是否是第一次保存该topic
        if topic != None: # 如果是第一次保存,则需要保存topic的基本信息
            s = topic.getSimpleString('[=]')
            self.topic_info_file.write(s + '\n[*ROWEND*]\n')
        # 保存comment信息
        for comment in comment_list:
            s = comment.getSimpleString('[=]')
            self.comment_info_file.write(s + '\n[*ROWEND*]\n')
            
        # 保证已经写入到磁盘上,这样可以随时终止
        self.topic_info_file.flush()
        os.fsync(self.topic_info_file.fileno())
        self.comment_info_file.flush()
        os.fsync(self.comment_info_file.fileno())
        
    def getAlreadyVisitedNum(self):
        #visitedGroups保存已经分配给taskQueue的链接,有可能链接还在处理中。
        #因此真实的已访问链接数为visitedGroups数减去待访问的链接数
        if len(self.visitedHref) == 0:
            return 0
        else:
            return len(self.visitedHref) - self.threadPool.getTaskLeft()

    def _getFutureVisit(self):
        """根据当前的访问情况,获取下一个要访问的网页
        """
        for topic_id in self.topicDict:
            if topic_id in self.finished:
                continue
            topic = self.topicDict[topic_id]
            if topic is None:
                continue
            if topic.max_comment_page <= 0:
                # 还未处理该topic的首页
                continue
            elif topic.max_comment_page == 1:
                # 该topic只有首页有评论
                continue
            else:
                # 该topic有多页评论
                next_start = self.nextPage[topic_id]
                url = "http://www.douban.com/group/topic/" + topic_id + "/?start=" + str(next_start * self.COMMENTS_PER_PAGE)
                if next_start <= topic.max_comment_page-1:
                    self.nextPage[topic_id] = next_start + 1
                    return url
                else:
                    continue
        
        return None
        
    def _taskHandler(self, url):
        """ 根据指定的url,抓取网页,并进行相应的访问控制
        """      
        print "Visiting : " + url
        webPage = WebPage(url)
        
        # 抓取页面内容
        flag = webPage.fetch()
        match_obj = RETopic.match(url)
        match_obj2 = REComment.match(url)
        
        if flag:
            if match_obj is not None:
                topic_id = match_obj.group(1)
                topic = Topic(topic_id, self.groupID)
                comment_list = topic.parse(webPage, True) # First page parsing
                self.topicDict[topic_id] = topic
                # 保存到文件
                self.saveThread.putTask(self._save_handler, comment_list, topic = topic)
                # 如果
            elif match_obj2 is not None:
                topic_id = match_obj2.group(1)
                start = int(match_obj2.group(2))
                # 抽取非第一页的评论数据
                if topic_id in self.topicDict:
                    topic = self.topicDict[topic_id]
                    if topic is None:
                        log.error('未知程序错误:该topic已经抓取结束,已释放相关内存,topic id:%s' % topic_id)
                        return False
                else:
                    log.error('未知程序错误:在topicDict字典中找不到topic id: %s' % topic_id)
                    self.failed.add(topic_id)
                    self.finished.add(topic_id)
                    return False
                    
                comment_list = topic.parse(webPage, False) # non-firstpage parsing
                # 保存到文件
                self.saveThread.putTask(self._save_handler, comment_list, topic = None)
            else:
                #pdb.set_trace()
                log.info('Topic链接格式错误:%s in Group: %s.' % (url, self.groupID))
            # 判断抓取是否结束,如果结束,则释放dict内存
            # 这个很重要,因为随着topic数量增多,内存会占很多
            if topic.isComplete():
                self.topicDict[topic_id] = None
                self.finished.add(topic_id)
                log.info('Topic: %s 抓取结束。' % topic_id)
                
            self.visitedHref.add(url)
            return True
        else:
            # 处理抓取失败的网页集合
            # 只要一个网页抓取失败,则加入到finished
            if match_obj is not None:
                # 讨论贴的第一页就没有抓到,则将其列入finished名单中
                topic_id = match_obj.group(1)
            elif match_obj2 is not None:
                topic_id = match_obj2.group(1)
                start = int(match_obj2.group(2))
            else:
                log.info('Topic链接格式错误:%s in Group: %s.' % (url, self.groupID))
            
            # 添加抓取失败的topic id和标记抓取结束的topic
            self.failed.add(topic_id)
            self.finished.add(topic_id) # 有可能已经记录了一些某些topic的信息
            self.visitedHref.add(url)
            return False
        

    def _getAllHrefsFromPage(self, url, pageSource):
        '''解析html源码,获取页面所有链接。返回链接列表'''
        hrefs = []
        soup = BeautifulSoup(pageSource)
        results = soup.find_all('a',href=True)
        for a in results:
            #必须将链接encode为utf8, 因为中文文件链接如 http://aa.com/文件.pdf 
            #在bs4中不会被自动url编码,从而导致encodeException
            href = a.get('href').encode('utf8')
            if not href.startswith('http'):
                href = urljoin(url, href)#处理相对链接的问题
            hrefs.append(href)
        return hrefs

    def _isHttpOrHttpsProtocol(self, href):
        protocal = urlparse(href).scheme
        if protocal == 'http' or protocal == 'https':
            return True
        return False
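_getFutureVisit above pages through comments 100 at a time (COMMENTS_PER_PAGE) by appending ?start=N to the topic URL; the helper below shows that arithmetic on its own (values illustrative, the topic id is the one mentioned in the comment above).

# Illustrative helper mirroring the comment-page URL arithmetic in _getFutureVisit above.
COMMENTS_PER_PAGE = 100

def comment_page_url(topic_id, next_page):
    """next_page counts from 1; page 0 is the topic page itself."""
    return ("http://www.douban.com/group/topic/" + topic_id +
            "/?start=" + str(next_page * COMMENTS_PER_PAGE))

for page in range(1, 4):
    print comment_page_url('35082953', page)
# -> .../?start=100, .../?start=200, .../?start=300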