def saveProxies(self):
    # Create a pool of 30 worker threads and start them.
    threadPool = ThreadPool(30)
    threadPool.startThreads()
    # Use the database helper to read the stored proxy records.
    #databases = database.DatabaseProxyIp()
    proxyip = self.proxyip_db.readData()
    # Loop over the records and queue a check task for each proxy.
    for proxy in proxyip:
        # Hand the test function to the thread pool.
        threadPool.putTask(self.checkclientUrl, proxy[0])
        #threadPool.putTask(self.checkProxy, proxy[0])
        #flag, proxy = checkProxy(proxy[0])
    # Collect the results in a loop: mark working proxies as available,
    # locked proxies as unavailable (available = 0), and delete failed ones.
    ip_fail = 0
    ip_ok = 0
    ip_lock = 0
    while threadPool.getTaskLeft():
        flag, proxy = threadPool.getTaskResult()
        print flag, proxy
        if flag == 'ok':
            #print 'ok ', proxy
            self.proxyip_db.updateData(1, proxy)
            ip_ok += 1
        elif flag == 'lock':
            self.proxyip_db.updateData(0, proxy)
            ip_lock += 1
        else:
            self.proxyip_db.delData(proxy)
            ip_fail += 1
    print '====> available ip: ', ip_ok, ' , lock ip: ', ip_lock, ' , fail ip: ', ip_fail, ' <===='
    threadPool.stopThreads()
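
# All of the snippets in this section drive the same kind of home-grown
# ThreadPool (startThreads / putTask / getTaskLeft / getTaskResult /
# stopThreads, plus threadNum, taskQueue and running attributes).  Its real
# implementation is not included here; the following is only a minimal sketch
# of that assumed interface, so the callers above and below are easier to read.
import threading
import Queue


class ThreadPool(object):

    def __init__(self, threadNum):
        self.threadNum = threadNum
        self.taskQueue = Queue.Queue()     # pending (func, args, kwargs) tuples
        self.resultQueue = Queue.Queue()   # results returned by finished tasks
        self.running = 0                   # number of tasks currently executing
        self.alive = False
        self.lock = threading.Lock()

    def startThreads(self):
        self.alive = True
        for _ in range(self.threadNum):
            worker = threading.Thread(target=self._work)
            worker.setDaemon(True)
            worker.start()

    def stopThreads(self):
        # Workers are daemon threads; clearing the flag lets them drain and exit.
        self.alive = False

    def putTask(self, func, *args, **kwargs):
        self.taskQueue.put((func, args, kwargs))

    def getTaskLeft(self):
        # Pending tasks + tasks being executed + results not yet collected.
        return self.taskQueue.qsize() + self.running + self.resultQueue.qsize()

    def getTaskResult(self, block=True, timeout=None):
        return self.resultQueue.get(block, timeout)

    def _work(self):
        while self.alive:
            try:
                func, args, kwargs = self.taskQueue.get(timeout=1)
            except Queue.Empty:
                continue
            with self.lock:
                self.running += 1
            try:
                result = func(*args, **kwargs)
                # Only queue a result when the task produced one, so handlers
                # that return nothing do not keep getTaskLeft() above zero.
                if result is not None:
                    self.resultQueue.put(result)
            finally:
                with self.lock:
                    self.running -= 1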
def main():
    threadPool = ThreadPool(5)
    threadPool.startThreads()
    f = codecs.open('tables/TopicInfo-all.txt', 'r', 'utf-8')  # read the file as unicode
    count = 0
    for line in f:
        line = line.strip()
        seg_list = line.split('[=]')
        if seg_list[1] == 'ustv':
            threadPool.putTask(task_handler, seg_list[0], seg_list)
            count += 1
    f.close()
    while threadPool.getTaskLeft() > 0:
        time.sleep(10)
        print 'Waiting to finish. Task left: %d' % threadPool.getTaskLeft()
    log.info('Number of topics in ustv: %d' % count)
def saveProxies():
    threadPool = ThreadPool(30)
    threadPool.startThreads()
    proxyFileOK = open('proxyOK.txt', 'a')
    proxyFileFail = open('proxyFail.txt', 'a')
    for proxy in proxiex:
        threadPool.putTask(checkProxy, proxy)
    while threadPool.getTaskLeft():
        flag, proxy = threadPool.getTaskResult()
        print flag, proxy
        if flag == 'ok':
            proxyFileOK.write(proxy)
            proxyFileOK.write('\n')
        else:
            proxyFileFail.write(proxy)
            proxyFileFail.write('\n')
    threadPool.stopThreads()
    proxyFileOK.close()
    proxyFileFail.close()
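
# checkProxy is referenced above but defined elsewhere; the loop only relies on
# it returning a (flag, proxy) tuple with flag == 'ok' for a usable proxy.
# A minimal sketch of such a checker, assuming plain "host:port" HTTP proxies
# and an arbitrary probe URL (illustrative only, not the project's version):
import urllib2


def checkProxy(proxy):
    try:
        opener = urllib2.build_opener(urllib2.ProxyHandler({'http': proxy}))
        opener.open('http://www.baidu.com/', timeout=5).read()
        return 'ok', proxy
    except Exception:
        return 'fail', proxy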
class Crawler(object): def __init__(self, args=Strategy()): self.url = args.url self.max_depth = args.max_depth #指定网页深度 self.max_count = args.max_count #爬行最大数量 self.concurrency = args.concurrency #线程数 self.timeout = args.timeout #超时 self.cookies = args.cookies #cookies self.ssl_verify = args.ssl_verify #ssl self.same_host = args.same_host #是否只抓取相同host的链接 self.same_domain = args.same_domain #是否只抓取相同domain的链接 self.currentDepth = 1 #标注初始爬虫深度,从1开始 self.keyword = args.keyword #指定关键词,使用console的默认编码来解码 self.threadPool = ThreadPool(args.concurrency) #线程池,指定线程数 self.visitedHrefs = set() #已访问的链接 self.unvisitedHrefs = deque() #待访问的链接 self.unvisitedHrefs.append(args.url) #添加首个待访问的链接 self.isCrawling = False #标记爬虫是否开始执行任务 self.file = BASEDIR + '/cache/crawler/' + genFilename( self.url) + '.txt' # print self.file # print 'args.url=\t',args.url ################# #此句有问题 self.database = Database(args.dbFile) #数据库 # print 'hehe' self.lock = Lock() def start(self): # print '\nStart Crawling\n' if not self._isDatabaseAvaliable(): # print 'Error: Unable to open database file.\n' pass else: pass if True: self.isCrawling = True self.threadPool.startThreads() while self.currentDepth <= self.max_depth and len( self.visitedHrefs) <= self.max_count: #分配任务,线程池并发下载当前深度的所有页面(该操作不阻塞) self._assignCurrentDepthTasks() #等待当前线程池完成所有任务,当池内的所有任务完成时,即代表爬完了一个网页深度 #self.threadPool.taskJoin()可代替以下操作,可无法Ctrl-C Interupt counter = 0 while self.threadPool.getTaskLeft() and counter < 600: # print '>>taskleft:\t',self.threadPool.getTaskLeft() # print self.threadPool.taskQueue.qsize() # print self.threadPool.resultQueue.qsize() # print self.threadPool.running time.sleep(1) counter += 1 # self.threadPool.taskJoin() # print 'Depth %d Finish. Totally visited %d links. \n' % ( # self.currentDepth, len(self.visitedHrefs)) # log.info('Depth %d Finish. 
Total visited Links: %d\n' % ( # self.currentDepth, len(self.visitedHrefs))) self.currentDepth += 1 self.stop() def stop(self): self.isCrawling = False self.threadPool.stopThreads() # self.database.close() def saveAllHrefsToFile(self, nonehtml=True): try: cf = CrawlerFile(url=self.url) contentlist = [] hrefs = [i for i in self.visitedHrefs ] + [j for j in self.unvisitedHrefs] for href in hrefs: if href.endswith('.html') and nonehtml: continue contentlist.append(href) cf.saveSection('Hrefs', contentlist, coverfile=True) # fp = open(self.file,'w') # fp.write('[Hrefs]'+os.linesep) # hrefs = [i for i in self.visitedHrefs] + [j for j in self.unvisitedHrefs] # rethrefs = [] # print 'Totally ',len(hrefs), ' hrefs' # for href in hrefs: # if href.endswith('.html'): # continue # rethrefs.append(href) # fp.write(href + os.linesep) # print href # print 'Totally ',len(rethrefs), ' aviable hrefs' # fp.close() except: pass def _getCrawlerPaths(self, url): ''' ''' try: paths = [] baseulp = urlparse(url) cf = CrawlerFile(url=url) urls = cf.getSection('Hrefs') #print urls for eachline in urls: eachline = eachline.replace('\r', '') eachline = eachline.replace('\n', '') #print eachline eachulp = urlparse(eachline) if baseulp.scheme == eachulp.scheme and baseulp.netloc == eachulp.netloc: fullpath = eachulp.path if fullpath.find('.') == -1 and fullpath.endswith( '/') == False: fullpath += '/' pos = 0 while True: # print 'fullpath=',fullpath pos = fullpath.find('/', pos) if pos == -1: break tmppth = eachulp.scheme + '://' + eachulp.netloc + eachulp.path[: pos] if tmppth.endswith('/'): #tmppth = tmppth[:-1] continue if tmppth not in paths: paths.append(tmppth) pos += 1 return paths except Exception, e: print 'Exception:\t', e return [url]
class Crawler(object):

    def __init__(self, args):
        # Maximum crawl depth.
        self.depth = args.depth
        # Current crawl depth, starting from 1.
        self.currentDepth = 1
        # Keyword to match; decode it with the console's default encoding.
        self.keyword = args.keyword.decode(getdefaultlocale()[1])
        # Database handle.
        self.database = Database()
        # Thread pool with the requested number of worker threads.
        self.threadPool = ThreadPool(args.threadNum)
        # Links that have already been visited (or scheduled).
        self.visitedHrefs = set()
        # Links waiting to be visited.
        self.unvisitedHrefs = deque()
        # Seed the queue with the start URL.
        self.unvisitedHrefs.append(args.url)
        # Flag marking whether the crawler is running.
        self.isCrawling = False

    def start(self):
        print '\nStart Crawling\n'
        if not self._isDatabaseAvaliable():
            print 'Error: Unable to open database file.\n'
        else:
            self.isCrawling = True
            self.threadPool.startThreads()
            while self.currentDepth < self.depth + 1:
                # Dispatch all pages of the current depth to the thread pool
                # (this call does not block).
                self._assignCurrentDepthTasks()
                # Wait until the pool finishes every task; an empty queue means
                # one full depth level has been crawled.
                # self.threadPool.taskJoin() would do the same, but it cannot be
                # interrupted with Ctrl-C.
                while self.threadPool.getTaskLeft():
                    time.sleep(8)
                print 'Depth %d Finish. Totally visited %d links. \n' % (
                    self.currentDepth, len(self.visitedHrefs))
                log.info('Depth %d Finish. Total visited Links: %d\n' % (
                    self.currentDepth, len(self.visitedHrefs)))
                self.currentDepth += 1
            self.stop()

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        self.database.close()

    def getAlreadyVisitedNum(self):
        # visitedHrefs holds every link already handed to the task queue, some of
        # which may still be in flight, so the real number of visited links is
        # len(visitedHrefs) minus the number of tasks still pending.
        return len(self.visitedHrefs) - self.threadPool.getTaskLeft()

    def _assignCurrentDepthTasks(self):
        while self.unvisitedHrefs:
            url = self.unvisitedHrefs.popleft()
            # Hand the URL to the task queue.
            self.threadPool.putTask(self._taskHandler, url)
            # Mark the link as visited (or about to be visited) so the same URL
            # is never scheduled twice.
            self.visitedHrefs.add(url)

    def _taskHandler(self, url):
        # Fetch the page source first, then save it; both are blocking
        # operations, so they run inside a worker thread.
        webPage = WebPage(url)
        if webPage.fetch():
            self._saveTaskResults(webPage)
            self._addUnvisitedHrefs(webPage)

    def _saveTaskResults(self, webPage):
        url, pageSource = webPage.getDatas()
        try:
            if self.keyword:
                # A case-insensitive regex search is (probably) faster than
                # lower()-ing the source and then searching.
                if re.search(self.keyword, pageSource, re.I):
                    log.info('save data: url=%s, pageSource=%s, keyword=%s \n' %
                             (url, pageSource, self.keyword))
                    self.database.saveData(url, pageSource, self.keyword)
            else:
                self.database.saveData(url, pageSource)
        except Exception, e:
            log.error(' URL: %s ' % url + traceback.format_exc())
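
# The crawlers in this section all depend on a WebPage helper whose contract is
# fetch() -> page source (or None on failure) and getDatas() -> (url,
# pageSource).  The project's real class (headers, retries, encoding handling)
# is not shown here; this is only a bare sketch of that assumed interface,
# using urllib2 for illustration:
import urllib2


class WebPage(object):

    def __init__(self, url):
        self.url = url
        self.pageSource = None

    def fetch(self, timeout=10):
        # Download the page and keep the raw source; report failure as None.
        try:
            response = urllib2.urlopen(self.url, timeout=timeout)
            self.pageSource = response.read()
            return self.pageSource
        except Exception:
            return None

    def getDatas(self):
        return self.url, self.pageSource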
class CommentCrawler(object): def __init__(self, group_id, topic_id_list, thread_num, base_path, topic_info_path, comment_info_path): """ `group_id` 当前的Group id `topic_id_list` 需要抓取的topic id的list `thread_num` 开启的线程数目 `topic_info_path` 存储topic信息的文件 `comment_info_path` 存储comment信息的文件 """ #线程池,指定线程数 self.thread_pool = ThreadPool(thread_num) # 由于现在是将不同的topic信息保存到不同的文件中,所以可以同时存储 self.save_thread = ThreadPool(10) self.topic_info_path = topic_info_path self.comment_info_path = comment_info_path self.base_path = base_path # 已经访问的页面: Group id ==> True or False self.visited_href = set() # 抓取失败的topic id self.failed = set() # 依次为每个小组抽取topic评论 self.group_id = group_id self.topic_id_list = topic_id_list # 等待抓取的topic列表 # 存储结果 # topic ID ==> Topic对象 self.topic_dict = dict() # 存放下一个处理的评论页数: topic ID ==> 1,2,3... self.next_page = dict() # 已经抓取完毕的topic id集合 self.finished = set() self.is_crawling = False # 每个topic抓取的最多comments个数 #self.MAX_COMMETS_NUM = 5000 self.MAX_COMMETS_NUM = float('inf') # 每页的评论数量 self.COMMENTS_PER_PAGE = 100 def start(self): print '\nStart Crawling comment list for group: ' + self.group_id + '...\n' self.is_crawling = True self.thread_pool.startThreads() self.save_thread.startThreads() # 打开需要存储的文件 self.topic_info_file = codecs.open(self.topic_info_path, 'w', 'utf-8') self.comment_info_file = codecs.open(self.comment_info_path, 'w', 'utf-8') self.topic_id_list = list(set(self.topic_id_list)) # 消除重复的topic id print "Total topics in group %s: %d." % (self.group_id, len(self.topic_id_list)) # 初始化添加任务 for topic_id in self.topic_id_list: url = "http://www.douban.com/group/topic/" + topic_id + "/" self.thread_pool.putTask(self._taskHandler, url) # 下一页评论类似:http://www.douban.com/group/topic/35082953/?start=100 self.next_page[topic_id] = 1 # 完全抛弃之前的抽取深度的概念,改为随时向thread pool推送任务 while True: # 保证任何时候thread pool中的任务数为线程数的2倍 print "Check threalPool queue..." while self.thread_pool.getTaskLeft() < self.thread_pool.threadNum * 2: # 获取未来需要访问的链接 url = self._getFutureVisit() if url is not None: self.thread_pool.putTask(self._taskHandler, url) else: # 已经不存在下一个链接 break # 每隔一秒检查thread pool的队列 time.sleep(2) # 检查是否处理完毕 if len(self.finished) == len(self.topic_id_list): break elif len(self.finished) > len(self.topic_id_list): assert(False) print 'Total topics: %d, Finished topic: %d' % (len(self.topic_id_list), len(self.finished)) remain = set(self.topic_id_list) - self.finished if len(remain) < 5: print 'Unfinished: ', remain # 等待线程池中所有的任务都完成 print "Totally visited: ", len(self.visited_href) #pdb.set_trace() while self.thread_pool.getTaskLeft() > 0: print "Task left in threadPool: ", self.thread_pool.getTaskLeft() print "Task queue size: ", self.thread_pool.taskQueue.qsize() print "Running tasks: ", self.thread_pool.running time.sleep(2) # 检查保存线程完成情况 while self.save_thread.getTaskLeft() > 0: print "Task left in save thread: ", self.save_thread.getTaskLeft() print "Task queue size: ", self.save_thread.taskQueue.qsize() print "Running tasks: ", self.save_thread.running time.sleep(2) # 记录抓取失败的topic id log.info('抓取失败的topic id:') s = '' for topic_id in self.failed: s += (topic_id + '\n') log.info('\n' + s) print "Terminating all threads..." self.stop() assert(self.thread_pool.getTaskLeft() == 0) self.topic_info_file.close() self.comment_info_file.close() print "Main Crawling procedure finished!" print "Start to save result..." 
#self._saveCommentList() #self._saveComment2file() log.info("Processing done with group: %s" % (self.group_id)) def stop(self): self.is_crawling = False self.thread_pool.stopThreads() self.save_thread.stopThreads() def _taskHandler(self, url): """ 根据指定的url,抓取网页,并进行相应的访问控制 """ print "Visiting : " + url webPage = WebPage(url) # 抓取页面内容 flag = webPage.fetch() match_obj = RETopic.match(url) match_obj2 = REComment.match(url) if flag: if match_obj is not None: topic_id = match_obj.group(1) topic = Topic(topic_id, self.group_id) comment_list = topic.parse(webPage, isFirstPage = True) # First page parsing self.topic_dict[topic_id] = topic # 保存到单个文件(已废弃不用) #self.save_thread.putTask(self._saveHandler, comment_list, topic = topic) elif match_obj2 is not None: topic_id = match_obj2.group(1) start = int(match_obj2.group(2)) # 抽取非第一页的评论数据 if topic_id in self.topic_dict: topic = self.topic_dict[topic_id] if topic is None: log.error('未知程序错误:结束topic id为%s的抽取,释放内存。' % topic_id) self.topic_dict[topic_id] = None return False else: # 这里的含义为:必须先处理第一页的评论,否则该topic_id不会作为self.topic_dict的键出现 log.error('错误:必须先抽取第一页的评论数据:topic id: %s' % topic_id) self.failed.add(topic_id) self.finished.add(topic_id) return False comment_list = topic.parse(webPage, isFirstPage = False) # non-firstpage parsing # 保存到单个文件(已废弃不用) #self.save_thread.putTask(self._saveHandler, comment_list, topic = None) else: #pdb.set_trace() log.info('Topic链接格式错误:%s in Group: %s.' % (url, self.group_id)) # 判断抓取是否结束,如果结束,则释放dict内存 # 这个很重要,因为随着topic数量增多,内存会占很多 if topic.isComplete(): self.save_thread.putTask(self._saveTopicHandler, self.topic_dict, topic_id) #self.topic_dict[topic_id] = None # 释放资源 self.finished.add(topic_id) log.info('Topic: %s 抓取结束。' % topic_id) self.visited_href.add(url) return True else: # 处理抓取失败的网页集合 # 只要一个网页抓取失败,则加入到finished if match_obj is not None: # 讨论贴的第一页就没有抓到,则将其列入finished名单中 topic_id = match_obj.group(1) elif match_obj2 is not None: topic_id = match_obj2.group(1) start = int(match_obj2.group(2)) else: log.info('Topic链接格式错误:%s in Group: %s.' 
% (url, self.group_id)) # 添加抓取失败的topic id和标记抓取结束的topic self.failed.add(topic_id) self.finished.add(topic_id) # 有可能已经记录了一些某些topic的信息 self.visited_href.add(url) return False def _saveTopicHandler(self, topic_dict, topic_id): """ 存储抓取完毕的帖子信息以及其对应的Comment。 不过,跟_saveHandler函数不同的是,这里是按照topic id存储 @topic_dict 存储topic信息的字典 @topic_id 需要存储的topic id """ # 对评论进行排序,并查找quote comment topic = topic_dict[topic_id] topic.sortComment() topic_path = self.base_path + group_id + '/' + topic_id + '-info.txt' # 存储topic本身的信息 f = codecs.open(topic_path, 'w', 'utf-8') s = topic.getSimpleString('[=]') f.write(s + '\n') #f.write('[*ROWEND*]') # 存储comment信息,存储到相同的文件中 for comment in topic.comment_list: s = comment.getSimpleString('[=]') #f.write(s + '\n[*ROWEND*]\n') f.write(s + '\n') f.close() self.topic_dict[topic_id] = None # 释放资源 log.info("Topic: %s 存储结束" % topic_id) def _getFutureVisit(self): """根据当前的访问情况,获取下一个要访问的网页 """ for topic_id in self.topic_dict: if topic_id in self.finished: continue topic = self.topic_dict[topic_id] if topic is None: continue if topic.max_comment_page <= 0: # 还未处理该topic的首页 continue elif topic.max_comment_page == 1: # 该topic只有首页有评论 continue else: # 该topic有多页评论 next_start = self.next_page[topic_id] url = "http://www.douban.com/group/topic/" + topic_id + "/?start=" + str(next_start * self.COMMENTS_PER_PAGE) if next_start <= topic.max_comment_page-1: self.next_page[topic_id] = next_start + 1 return url else: continue return None def _getAllHrefsFromPage(self, url, pageSource): '''解析html源码,获取页面所有链接。返回链接列表''' hrefs = [] soup = BeautifulSoup(pageSource) results = soup.find_all('a',href=True) for a in results: #必须将链接encode为utf8, 因为中文文件链接如 http://aa.com/文件.pdf #在bs4中不会被自动url编码,从而导致encodeException href = a.get('href').encode('utf8') if not href.startswith('http'): href = urljoin(url, href)#处理相对链接的问题 hrefs.append(href) return hrefs def _isHttpOrHttpsProtocol(self, href): protocal = urlparse(href).scheme if protocal == 'http' or protocal == 'https': return True return False def _getAlreadyVisitedNum(self): #visitedGroups保存已经分配给taskQueue的链接,有可能链接还在处理中。 #因此真实的已访问链接数为visitedGroups数减去待访问的链接数 if len(self.visited_href) == 0: return 0 else: return len(self.visited_href) - self.thread_pool.getTaskLeft() '''
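
# The douban CommentCrawler above dispatches on two module-level patterns,
# RETopic and REComment, that are defined elsewhere in the project.  Judging
# from the URLs it builds itself ("http://www.douban.com/group/topic/<id>/" and
# ".../?start=<offset>"), they are presumably along these lines (a sketch, not
# the original definitions):
import re

RETopic = re.compile(r'^http://www\.douban\.com/group/topic/(\d+)/?$')
REComment = re.compile(r'^http://www\.douban\.com/group/topic/(\d+)/\?start=(\d+)$')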
class PostIDCrawler(object): def __init__(self, start_url, thread_num, post_list_path, max_post_num = 1000): """ `group_id` 待抓取的group id `thread_num` 抓取的线程 `post_list_path` 保存所有的post id list的文件路径 """ #线程池,指定线程数 self.thread_pool = ThreadPool(thread_num) # 保存topic的线程 # NOTE: 这里只允许一个保存进程,因为要操作同一个文件 self.save_thread = ThreadPool(1) # 保存group相关信息 self.post_list_path = post_list_path # 已经访问的页面: Group id ==> True or False self.visited_href = set() #待访问的小组讨论页面 self.unvisited_href = deque() # 访问失败的页面链接 self.failed_href = set() self.start_url = start_url # 抓取结束有两种可能:1)抓取到的topic数目已经最大;2)已经将所有的topic全部抓取 # 只保存thread-id self.post_list = list() self.is_crawling = False # 每个Group抓取的最大topic个数 self.MAX_POST_NUM = max_post_num #self.MAX_POST_NUM = float('inf') # 每一页中显示的最多的topic数量,似乎每页中不一定显示25个topic #self.MAX_TOPICS_PER_PAGE = 25 def start(self): print '\nStart crawling post id list...\n' self.is_crawling = True self.thread_pool.startThreads() self.save_thread.startThreads() # 打开需要存储的文件 self.post_list_file = codecs.open(self.post_list_path, 'w', 'utf-8') print "Add start url:", self.start_url self.unvisited_href.append(self.start_url) #分配任务,线程池并发下载当前深度的所有页面(该操作不阻塞) self._assignInitTask() #等待当前线程池完成所有任务,当池内的所有任务完成时,才进行下一个小组的抓取 #self.thread_pool.taskJoin()可代替以下操作,可无法Ctrl-C Interupt while self.thread_pool.getTaskLeft() > 0: #print "Task left: ", self.thread_pool.getTaskLeft() # 判断是否已经抓了足够多的thread id if len(self.post_list) > self.MAX_POST_NUM: print u'已经达到最大讨论帖抓取数,即将退出抓取。' break else: print u'当前已抓取的讨论帖个数:', len(self.post_list) time.sleep(3) # 存储抓取的结果并等待存储线程结束 while self.save_thread.getTaskLeft() > 0: print 'Wairting for saving thread. Taks left: %d' % self.save_thread.getTaskLeft() time.sleep(3) log.info("Thread ID list crawling done.") self.stop() # 结束时可能还有任务,但是当前已经抓去了足够量的讨论帖 #assert(self.thread_pool.getTaskLeft() == 0) # 关闭文件 self.post_list_file.close() print "Main Crawling procedure finished!" 
def stop(self): self.is_crawling = False self.thread_pool.stopThreads() self.save_thread.stopThreads() def _assignInitTask(self): """取出一个线程,并为这个线程分配任务,即抓取网页 """ while len(self.unvisited_href) > 0: # 从未访问的列表中抽出一个任务,并为其分配thread url = self.unvisited_href.popleft() self.thread_pool.putTask(self._taskHandler, url) # 添加已经访问过的小组id self.visited_href.add(url) def _taskHandler(self, url): """ 根据指定的url,抓取网页,并进行相应的访问控制 """ print "Visiting : " + url webPage = WebPage(url) # 抓取页面内容 flag = webPage.fetch() if flag: url, pageSource = webPage.getDatas() hrefs = self._getAllHrefsFromPage(url, pageSource) # 找到有效的链接 post_list = [] next_page_url = None for href in hrefs: # 只有满足讨论帖链接格式的链接才会被处理 m = regex_post_first.match(href) if self._isHttpOrHttpsProtocol(href) and m is not None: post_list.append(m.group('post_id')) # 在当前页面中查找匹配“下一页”的链接 m = regex_next_page.match(href) if m != None and (not m.group() in self.visited_href): url = m.group() print 'Add next page link: ', url self.thread_pool.putTask(self._taskHandler, url) self.visited_href.add(url) for post_id in post_list: #print "Add thread link: ", thread self.post_list.append(post_id) # 存储已经抓取的topic list self.save_thread.putTask(self._saveTopicHandler, post_list) else: log.error(u"抓取讨论帖列表时,发现网址格式错误。URL: %s" % url) # if page reading fails self.failed_href.add(url) return False def _saveTopicHandler(self, post_list): """ 将每次从页面中抓取的topic id随时保存到文件中 NOTE: saveThread只有一个,所以这里不会造成访问冲突 """ for tid in post_list: self.post_list_file.write(tid + '\n') self.post_list_file.flush() os.fsync(self.post_list_file) def _getAllHrefsFromPage(self, url, pageSource): '''解析html源码,获取页面所有链接。返回链接列表''' hrefs = [] soup = BeautifulSoup(pageSource) results = soup.find_all('a',href=True) for a in results: #必须将链接encode为utf8, 因为中文文件链接如 http://aa.com/文件.pdf #在bs4中不会被自动url编码,从而导致encodeException href = a.get('href').encode('utf8') if not href.startswith('http'): href = urljoin(url, href)#处理相对链接的问题 hrefs.append(href) return hrefs def _isHttpOrHttpsProtocol(self, href): protocal = urlparse(href).scheme if protocal == 'http' or protocal == 'https': return True return False def _getAlreadyVisitedNum(self): #visitedGroups保存已经分配给taskQueue的链接,有可能链接还在处理中。 #因此真实的已访问链接数为visitedGroups数减去待访问的链接数 if len(self.visited_href) == 0: return 0 else: return len(self.visited_href) - self.thread_pool.getTaskLeft()
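
# PostIDCrawler above expects two module-level patterns that are not part of
# this section: regex_post_first, which must expose a named group 'post_id'
# when matched against a post link, and regex_next_page, whose full match is
# used directly as the "next page" URL.  The concrete patterns depend on the
# site being listed; for a Tianya-style board the post pattern could look like
# the sketch below (the listing pattern here is purely hypothetical):
import re

regex_post_first = re.compile(
    r'^http://bbs\.tianya\.cn/post-\w+-(?P<post_id>\d+)-1\.shtml$')
regex_next_page = re.compile(
    r'^http://bbs\.tianya\.cn/list-\w+-\d+\.shtml$')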
class Crawler(object):

    def __init__(self, args):
        # Maximum crawl depth.
        self.depth = args.depth
        # Current crawl depth, starting from 1.
        self.currentDepth = 1
        # Database handle.
        self.database = Database(args.dbFile)
        # Thread pool with the requested number of worker threads.
        self.threadPool = ThreadPool(args.threadNum)
        # Links that have already been visited (or scheduled).
        self.visitedHrefs = set()
        # Links waiting to be visited.
        self.unvisitedHrefs = deque()
        # Seed the queue with the start URL.
        self.url = args.url
        self.unvisitedHrefs.append(args.url)
        # Flag marking whether the crawler is running.
        self.isCrawling = False

    def isDatabaseAvaliable(self):
        if self.database.isConn():
            return True
        return False

    def _saveTaskResults(self, my_web):
        # Only keep pages whose URL looks like an article page.
        pattern = r'.*\w{16}\.((html)|(shtml))'
        url, pageSource = my_web.getDatas()
        r = re.search(pattern, url)
        if r is not None:
            soup = BeautifulSoup(pageSource)
            if soup.h2 is not None:
                title = unicode(soup.h2.string)
            elif soup.p is not None:
                title = unicode(soup.p.string)
            else:
                title = 'no title'
            text = ''
            for i in soup.find_all('p'):
                text += unicode(i.get_text())
            #tmp = trieKmp.gao(title + text)
            t1 = trieKmp.gao(title)
            t2 = trieKmp.gao(text)
            tmp = []
            for i in xrange(len(t1)):
                if t1[i] != '0':
                    tmp.append('9')
                else:
                    tmp.append(t2[i])
            res = ''.join(tmp)
            #print 'res=', res
            #print 'text=', text, 'tmp=', tmp
            self.database.saveData(url, title, text[:40], res)
        return 0

    def _getAllHrefsFromPage(self, url, pageSource):
        '''Parse the page source with BeautifulSoup and return all links.'''
        hrefs = []
        soup = BeautifulSoup(pageSource)
        results = soup.find_all('a', href=True)
        for a in results:
            # Encode to utf8 so links containing Chinese characters survive.
            href = a.get('href').encode('utf8')
            if not href.strip().startswith('http'):
                # Resolve relative links against the page URL.
                href = urljoin(url, href)
            hrefs.append(href)
        return hrefs

    def _isHttpOrHttpsProtocol(self, href):
        '''Only keep http/https links that stay on the start site and are not images.'''
        protocal = urlparse(href).scheme
        if protocal == 'http' or protocal == 'https':
            if not (self.url in href):
                return False
            if '.jpg' in href:
                return False
            return True
        return False

    def _isHrefRepeated(self, href):
        '''Skip links that have already been seen.'''
        if href in self.visitedHrefs or href in self.unvisitedHrefs:
            return True
        return False

    def _addUnvisitedHrefs(self, my_web):
        '''Add links that have not been visited yet.'''
        url, pageSource = my_web.getDatas()
        hrefs = self._getAllHrefsFromPage(url, pageSource)
        for href in hrefs:
            if self._isHttpOrHttpsProtocol(href):
                if not self._isHrefRepeated(href):
                    self.unvisitedHrefs.append(href)

    def getAlreadyVisitedNum(self):
        '''Return the number of pages already visited.'''
        return len(self.visitedHrefs) - self.threadPool.getTaskLeft()

    def _taskHandler(self, url):
        '''Methods starting with _ are put on the queue for the worker threads.'''
        my_web = WebPage(url)
        if my_web.fetch():
            #print 'has visited %s' % url
            self._saveTaskResults(my_web)
            self._addUnvisitedHrefs(my_web)

    def _assignCurrentDepthTasks(self):
        '''Dispatch the current depth's URLs; this call does not block.'''
        while self.unvisitedHrefs:
            url = self.unvisitedHrefs.popleft()
            # Hand the URL to the task queue.
            self.threadPool.putTask(self._taskHandler, url)
            self.visitedHrefs.add(url)

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        self.database.close()

    def start(self):
        print '\nstart crawling', self.url
        self.isCrawling = True
        self.threadPool.startThreads()
        while self.currentDepth < self.depth + 1:
            # Dispatch this depth's tasks (non-blocking).
            self._assignCurrentDepthTasks()
            # Wait for the thread pool to drain before going one level deeper.
            #self.threadPool.taskJoin()
            while self.threadPool.getTaskLeft():
                time.sleep(8)
            print 'depth %d finished. totally visited %d links.\n' % (
                self.currentDepth, len(self.visitedHrefs))
            log.info('depth %d finished. totally visited %d links.\n' % (
                self.currentDepth, len(self.visitedHrefs)))
            self.currentDepth += 1
        self.stop()

    def selfTesting(self):
        url = 'http://www.baidu.com'
        print '\nVisiting www.baidu.com using directly'
        my_web = WebPage(url)
        pageSource = my_web.fetch()
        # Check the network connection first.
        if pageSource == None:
            print 'please check your network'
        elif not self.isDatabaseAvaliable():
            # NOTE: relies on a module-level ``args`` being in scope.
            print 'please make sure you have the permission to save data: %s\n' % args.dbFile
        else:
            self._saveTaskResults(my_web)
            print 'save data successfully'
            print 'seems all is ok'
class CommentCrawler(object): def __init__(self, section_id, post_id_list, crawler_thread_num, save_thread_num, post_base_path): """ `section_id` 天涯的板块名称 `post_id_list` 需要抓取的post id的list `thread_num` 开启的线程数目 post_base_path: 存储抓取结果的基本目录,每个post一个文件,并以该post的ID命名 """ # 抓取网页的线程池,指定线程数 self.thread_pool = ThreadPool(crawler_thread_num) # 由于现在是将不同的topic信息保存到不同的文件中,所以可以同时存储 self.save_thread = ThreadPool(save_thread_num) # 保存抓取信息的base path self.base_path = post_base_path # 已经访问的页面: Group id ==> True or False self.visited_href = set() self.visited_post = set() # 已经添加访问的页面的id集合 self.finished = set() # 已经抓取完毕的topic id集合 # 抓取失败的topic id self.failed = set() # 依次为每个小组抽取topic评论 self.section_id = section_id self.post_id_list = post_id_list # 等待抓取的topic列表 self.current_post_id_list = list(post_id_list) # 用于逐步向任务列表中加入post id # 存储结果 # topic ID ==> Topic对象 self.post_dict = dict() # 存放下一个处理的评论页数: topic ID ==> 1,2,3... self.next_page = dict() self.is_crawling = False # 每个topic抓取的最多comments个数 #self.MAX_COMMETS_NUM = 1000 self.MAX_COMMETS_NUM = float('inf') def start(self): print '\nStart Crawling comment list for group: ' + self.section_id + '...\n' self.is_crawling = True self.thread_pool.startThreads() self.save_thread.startThreads() self.post_id_list = list(set(self.post_id_list)) # 消除重复的topic id print u"Total number of post in section %s: %d." % (self.section_id, len(self.post_id_list)) # 先为字典建立所有的key,避免出现“RuntimeError: dictionary changed size during iteration”错误 for post_id in self.post_id_list: self.post_dict[post_id] = None # 初始化添加一部分post的id到列表 for i in xrange(self.thread_pool.threadNum * 2): # TODO: 这里的URL模式只是针对“天涯杂谈”部分的链接 if len(self.current_post_id_list) > 0: post_id = self.current_post_id_list.pop() url = "http://bbs.tianya.cn/post-%s-%s-1.shtml" % (self.section_id, post_id) self.thread_pool.putTask(self._taskHandler, url) # 完全抛弃之前的抽取深度的概念,改为随时向thread pool推送任务 while True: # 保证任何时候thread pool中的任务数最少为线程数的2倍 print "Check threalPool queue..." while self.thread_pool.getTaskLeft() < self.thread_pool.threadNum * 2: # 获取未来需要访问的链接 url = self._getFutureVisit() if url is not None: self.thread_pool.putTask(self._taskHandler, url) else: # 已经不存在下一个链接 #print 'No future visit url.' break # 每隔一秒检查thread pool的队列 time.sleep(2) # 检查是否处理完毕 if len(self.finished) == len(self.post_id_list): break elif len(self.finished) > len(self.post_id_list): assert(False) print 'Number of task in LIFO queue: ', self.thread_pool.taskQueue.qsize() print 'Number of task in save queue: ', self.save_thread.taskQueue.qsize() print 'Total posts: %d, Finished topic: %d' % (len(self.post_id_list), len(self.finished)) # 等待线程池中所有的任务都完成 print "Totally visited: ", len(self.visited_href) #pdb.set_trace() while self.thread_pool.getTaskLeft() > 0: print "Task left in threadPool: ", self.thread_pool.getTaskLeft() print "Task queue size: ", self.thread_pool.taskQueue.qsize() print "Running tasks: ", self.thread_pool.running time.sleep(2) # 检查保存线程完成情况 while self.save_thread.getTaskLeft() > 0: print "Task left in save thread: ", self.save_thread.getTaskLeft() print "Task queue size: ", self.save_thread.taskQueue.qsize() print "Running tasks: ", self.save_thread.running time.sleep(2) # 记录抓取失败的topic id log.info(u'抓取失败的post id:') s = '' for post_id in self.failed: s += (post_id + '\n') log.info('\n' + s) print "Terminating all threads..." self.stop() assert(self.thread_pool.getTaskLeft() == 0) print "Main Crawling procedure finished!" 
log.info("Processing done with tianya section: %s" % (self.section_id)) def stop(self): self.is_crawling = False self.thread_pool.stopThreads() self.save_thread.stopThreads() def _taskHandler(self, url): """ 根据指定的url,抓取网页,并进行相应的访问控制 """ print "Visiting : " + url webPage = WebPage(url) # 抓取页面内容 flag = webPage.fetch() m = regex_post.match(url) if m == None: log.info(u'Post链接格式错误:%s in Group: %s.' % (url, self.section_id)) return True else: log.info(u'访问:' + url) comment_page_index = int(m.group('page_index')) post_id = m.group('post_id') if flag: if comment_page_index == 1: # 首页评论 post = Post(post_id, self.section_id) # 解析讨论帖的第一个页:包括原帖内容和评论内容 comment_list = post.parse(webPage, isFirstPage = True) # First page parsing self.post_dict[post_id] = post self.next_page[post_id] = 2 elif comment_page_index > 1: # 抽取非第一页的评论数据 if post_id in self.post_dict: post = self.post_dict[post_id] else: # 这里的含义为:必须先处理第一页的评论,否则该post_id不会作为self.topic_dict的键出现 log.error(u'错误:必须先抽取第一页的评论数据:post id: %s' % post_id) self.failed.add(topic_id) self.finished.add(topic_id) return False if post is None: log.error(u'未知程序错误:结束post id为%s的抽取,释放内存。' % post_id) self.post_dict[post_id] = post return False comment_list = post.parse(webPage, isFirstPage = False) # non-firstpage parsing else: log.info(u'Post链接格式错误:%s in Group: %s.' % (url, self.section_id)) # 判断抓取是否结束,如果结束,则释放dict内存 # 这个很重要,因为随着topic数量增多,内存会占很多 if post.is_complete(): self.save_thread.putTask(self._saveTopicHandler, self.post_dict, post_id) self.finished.add(post_id) log.info(u'Topic: %s 抓取结束。' % post_id) self.visited_href.add(url) return True else: # 处理抓取失败的网页集合,只要一个网页抓取失败,则加入到finished # 添加抓取失败的post id和标记抓取结束的post self.failed.add(post_id) self.finished.add(post_id) # 有可能已经记录了一些某些topic的信息 self.visited_href.add(url) return False def _getFutureVisit(self): """根据当前的访问情况,获取下一个要访问的网页 """ # 先检查当前正在抓取的所有帖子,目标是尽快将其抓去完并保存 for post_id in self.post_dict: if post_id in self.finished: continue post = self.post_dict[post_id] if post is None: continue if post.total_comment_page <= 0: # 还未处理该topic的首页 continue elif post.total_comment_page == 1: # 该topic只有首页有评论 continue else: # 该topic有多页评论 next_page_index = self.next_page[post_id] if next_page_index > post.total_comment_page: continue else: url = "http://bbs.tianya.cn/post-free-%s-%d.shtml" % (post_id, next_page_index) self.next_page[post_id] = next_page_index + 1 return url # 如果当前正在处理的帖子全部已经抓取完毕,则加入新帖子post_id if len(self.current_post_id_list) > 0: post_id = self.current_post_id_list.pop() url = "http://bbs.tianya.cn/post-%s-%s-1.shtml" % (self.section_id, post_id) return url else: return None def _saveTopicHandler(self, post_dict, post_id): """ 存储抓取完毕的帖子信息以及其对应的Comment。 不过,跟_saveHandler函数不同的是,这里是按照topic id存储 post_dict 存储topic信息的字典 post_id 需要存储的post id NOTE: 因为随时可能被ctrl+C终止,而此时可能有些帖子的内容还没有保存完成。 """ #TODO: 添加SIGINT handler # 在保存结果钱,对评论进行排序,并查找quote comment post = post_dict[post_id] post.sort_comment() post_path = self.base_path + self.section_id + '/' + post_id + '-info.txt' # 存储topic本身的信息 f = codecs.open(post_path, 'w', 'utf-8') s = post.get_simple_string('[=]') f.write(s + '\n') # 存储comment信息,存储到相同的文件中 for comment in post.comment_list: s = comment.get_simple_string('[=]') f.write(s + '\n') f.close() # 释放资源 # NOTE: del self.post_dict[post_id]不能达到效果,如果需要根据post_id是否在 # self.post_dict中来判断是否已经抓取该帖子 self.post_dict[post_id] = None self.next_page[post_id] = None log.info(u"Topic: %s 存储结束。" % post_id) def _getAllHrefsFromPage(self, url, pageSource): '''解析html源码,获取页面所有链接。返回链接列表''' hrefs = [] soup = BeautifulSoup(pageSource) results = 
soup.find_all('a',href=True) for a in results: #必须将链接encode为utf8, 因为中文文件链接如 http://aa.com/文件.pdf #在bs4中不会被自动url编码,从而导致encodeException href = a.get('href').encode('utf8') if not href.startswith('http'): href = urljoin(url, href)#处理相对链接的问题 hrefs.append(href) return hrefs def _isHttpOrHttpsProtocol(self, href): protocal = urlparse(href).scheme if protocal == 'http' or protocal == 'https': return True return False
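
# _taskHandler in the Tianya CommentCrawler above relies on a module-level
# regex_post exposing the named groups 'post_id' and 'page_index'.  Based on
# the URLs the crawler itself builds
# ("http://bbs.tianya.cn/post-<section>-<post_id>-<page>.shtml"), a plausible
# definition is (sketch only):
import re

regex_post = re.compile(
    r'^http://bbs\.tianya\.cn/post-(?P<section_id>\w+)-(?P<post_id>\d+)-(?P<page_index>\d+)\.shtml$')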
class Crawler(object):

    def __init__(self, args):
        # Maximum crawl depth.
        self.depth = args.depth
        # Current crawl depth, starting from 1.
        self.currentDepth = 1
        # Keyword to match; decode it with the console's default encoding.
        self.keyword = args.keyword.decode(getdefaultlocale()[1])
        # Database handle.
        self.database = Database(args.dbFile)
        # Thread pool with the requested number of worker threads.
        self.threadPool = ThreadPool(args.threadNum)
        # Links that have already been visited (or scheduled).
        self.visitedHrefs = set()
        # Links waiting to be visited.
        self.unvisitedHrefs = deque()
        # Seed the queue with the start URL.
        self.unvisitedHrefs.append(args.url)
        # Flag marking whether the crawler is running.
        self.isCrawling = False

    def start(self):
        print '\nStart Crawling\n'
        if not self._isDatabaseAvailable():
            print 'Error: Unable to open database file.\n'
        else:
            self.isCrawling = True
            self.threadPool.startThreads()
            while self.currentDepth < self.depth + 1:
                # Dispatch all pages of the current depth to the thread pool
                # (this call does not block).
                self._assignCurrentDepthTasks()
                # Wait until the pool finishes every task; an empty queue means
                # one full depth level has been crawled.
                # self.threadPool.taskJoin() would do the same, but it cannot be
                # interrupted with Ctrl-C.
                while self.threadPool.getTaskLeft():
                    time.sleep(8)
                print 'Depth %d Finish. Totally visited %d links. \n' % (
                    self.currentDepth, len(self.visitedHrefs))
                log.info('Depth %d Finish. Total visited Links: %d\n' % (
                    self.currentDepth, len(self.visitedHrefs)))
                self.currentDepth += 1
            self.stop()

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        self.database.close()

    def getAlreadyVisitedNum(self):
        # visitedHrefs holds every link already handed to the task queue, some of
        # which may still be in flight, so the real number of visited links is
        # len(visitedHrefs) minus the number of tasks still pending.
        return len(self.visitedHrefs) - self.threadPool.getTaskLeft()

    def _assignCurrentDepthTasks(self):
        while self.unvisitedHrefs:
            url = self.unvisitedHrefs.popleft()
            # Hand the URL to the task queue.
            self.threadPool.putTask(self._taskHandler, url)
            # Mark the link as visited (or about to be visited) so the same URL
            # is never scheduled twice.
            self.visitedHrefs.add(url)

    def _taskHandler(self, url):
        # Fetch the page source first, then save it; both are blocking
        # operations, so they run inside a worker thread.
        webPage = WebPage(url)
        if webPage.fetch():
            self._saveTaskResults(webPage)
            self._addUnvisitedHrefs(webPage)

    def _saveTaskResults(self, webPage):
        url, pageSource = webPage.getDatas()
        try:
            if self.keyword:
                # A case-insensitive regex search is (probably) faster than
                # lower()-ing the source and then searching.
                if re.search(self.keyword, pageSource, re.I):
                    self.database.saveData(url, pageSource, self.keyword)
            else:
                self.database.saveData(url, pageSource)
        except Exception, e:
            log.error(' URL: %s ' % url + traceback.format_exc())
class Crawler(threading.Thread): def __init__(self, args, queue): threading.Thread.__init__(self) #指定网页深度 self.depth = args['depth'] #标注初始爬虫深度,从1开始 self.currentDepth = 1 #指定关键词,使用console的默认编码来解码 self.keyword = args['keyword'].decode(getdefaultlocale()[1]) #数据库 self.database = Database(db="bt_tornado") #线程池,指定线程数 self.threadPool = ThreadPool(args['threadNum']) #已访问的链接 self.visitedHrefs = set() #待访问的链接 self.unvisitedHrefs = deque() #添加待访问的链接 for url in args['url']: self.unvisitedHrefs.append(url) #标记爬虫是否开始执行任务 self.isCrawling = False # allow or deny crawl url self.entryFilter = args['entryFilter'] # allow to output back url self.yieldFilter = args['yieldFilter'] # self.callbackFilter = args['callbackFilter'] # self.db = args['db'] self.collection = args['collection'] # communication queue self.queue = queue def run(self): print '\nStart Crawling\n' if not self._isDatabaseAvaliable(): print 'Error: Unable to open database file.\n' else: self.isCrawling = True self.threadPool.startThreads() while self.currentDepth < self.depth+1: #分配任务,线程池并发下载当前深度的所有页面(该操作不阻塞) self._assignCurrentDepthTasks () #等待当前线程池完成所有任务,当池内的所有任务完成时,即代表爬完了一个网页深度 #self.threadPool.taskJoin()可代替以下操作,可无法Ctrl-C Interupt while self.threadPool.getTaskLeft(): time.sleep(8) print 'Depth %d Finish. Totally visited %d links. \n' % ( self.currentDepth, len(self.visitedHrefs)) log.info('Depth %d Finish. Total visited Links: %d\n' % ( self.currentDepth, len(self.visitedHrefs))) self.currentDepth += 1 self.stop() def stop(self): self.isCrawling = False self.threadPool.stopThreads() self.database.close() #use queue to communicate between threads self.queue.get() self.queue.task_done() def getAlreadyVisitedNum(self): #visitedHrefs保存已经分配给taskQueue的链接,有可能链接还在处理中。 #因此真实的已访问链接数为visitedHrefs数减去待访问的链接数 return len(self.visitedHrefs) - self.threadPool.getTaskLeft() def _assignCurrentDepthTasks(self): while self.unvisitedHrefs: url = self.unvisitedHrefs.popleft() if not self.__entry_filter(url): self.visitedHrefs.add(url) continue #向任务队列分配任务 self.threadPool.putTask(self._taskHandler, url) #标注该链接已被访问,或即将被访问,防止重复访问相同链接 self.visitedHrefs.add(url) def _callback_filter(self, webPage): #parse the web page to do sth url , pageSource = webPage.getDatas() for tmp in self.callbackFilter['List']: if re.compile(tmp,re.I|re.U).search(url): self.callbackFilter['func'](webPage) def _taskHandler(self, url): #先拿网页源码,再保存,两个都是高阻塞的操作,交给线程处理 webPage = WebPage(url) tmp = webPage.fetch() if tmp: self._callback_filter(webPage) self._saveTaskResults(webPage) self._addUnvisitedHrefs(webPage) def _saveTaskResults(self, webPage): url, pageSource = webPage.getDatas() _id = md5(url).hexdigest() try: if self.__yield_filter(url): query = {"id": _id} document = {"id": _id, "url":url, "createTime": datetime.now()} self.database.saveData(query=query, collection=self.collection, document=document) except Exception, e: log.error(' URL: %s ' % url + traceback.format_exc())
class CommentCrawler(object): def __init__(self, groupID, topicIDList, threadNum, topic_info_path, comment_info_path): """ `groupID` 当前的Group id `topicIDList` 需要抓取的topic id的list `threadNum` 开启的线程数目 `topic_info_path` 存储topic信息的文件 `comment_info_path` 存储comment信息的文件 """ #线程池,指定线程数 self.threadPool = ThreadPool(threadNum) # 写数据库的线程 #self.DBThread = ThreadPool(1) # 保证同时只有一个线程在写文件 self.saveThread = ThreadPool(1) self.database = Database("DoubanGroup.db") #self.database = Database("test.db") self.topic_info_path = topic_info_path self.comment_info_path = comment_info_path # 已经访问的页面: Group id ==> True or False self.visitedHref = set() # 抓取失败的topic id self.failed = set() # 依次为每个小组抽取topic评论 self.groupID = groupID self.topicIDList = topicIDList # 等待抓取的topic列表 # 存储结果 # topic ID ==> Topic对象 self.topicDict = dict() # 存放下一个处理的评论页数: topic ID ==> 1,2,3... self.nextPage = dict() # 已经抓取完毕的topic id集合 self.finished = set() self.visitedHref = set() # 已经访问的网页 self.isCrawling = False # 每个topic抓取的最多comments个数 #self.MAX_COMMETS_NUM = 5000 self.MAX_COMMETS_NUM = float('inf') # 每页的评论数量 self.COMMENTS_PER_PAGE = 100 def start(self): print '\nStart Crawling comment list for group: ' + self.groupID + '...\n' self.isCrawling = True self.threadPool.startThreads() self.saveThread.startThreads() # 打开需要存储的文件 self.topic_info_file = codecs.open(self.topic_info_path, 'w', 'utf-8') self.comment_info_file = codecs.open(self.comment_info_path, 'w', 'utf-8') # 从数据库中读取topic id列表 #self.topicIDList = self.database.readTopicList(self.groupID) self.topicIDList = list(set(self.topicIDList)) # 消除重复的topic id print "Total topics in group %s: %d." % (self.groupID, len(self.topicIDList)) # 初始化添加任务 for topic_id in self.topicIDList: url = "http://www.douban.com/group/topic/" + topic_id + "/" self.threadPool.putTask(self._taskHandler, url) # 下一页评论类似:http://www.douban.com/group/topic/35082953/?start=100 self.nextPage[topic_id] = 1 # 完全抛弃之前的抽取深度的概念,改为随时向thread pool推送任务 while True: # 保证任何时候thread pool中的任务数为线程数的2倍 print "Check threalPool queue..." while self.threadPool.getTaskLeft() < self.threadPool.threadNum * 2: # 获取未来需要访问的链接 url = self._getFutureVisit() if url is not None: self.threadPool.putTask(self._taskHandler, url) else: # 已经不存在下一个链接 break # 每隔一秒检查thread pool的队列 time.sleep(2) # 检查是否处理完毕 if len(self.finished) == len(self.topicIDList): break elif len(self.finished) > len(self.topicIDList): assert(False) print 'Total topics: %d, Finished topic: %d' % (len(self.topicIDList), len(self.finished)) remain = set(self.topicIDList) - self.finished if len(remain) < 5: print 'Unfinished: ', remain # 等待线程池中所有的任务都完成 print "Totally visited: ", len(self.visitedHref) #pdb.set_trace() while self.threadPool.getTaskLeft() > 0: print "Task left in threadPool: ", self.threadPool.getTaskLeft() print "Task queue size: ", self.threadPool.taskQueue.qsize() print "Running tasks: ", self.threadPool.running time.sleep(2) while self.saveThread.getTaskLeft() > 0: print "Task left in save thread: ", self.saveThread.getTaskLeft() print "Task queue size: ", self.saveThread.taskQueue.qsize() print "Running tasks: ", self.saveThread.running time.sleep(2) # 记录抓取失败的topic id log.info('抓取失败的topic id:') s = '' for topic_id in self.failed: s += (topic_id + '\n') log.info('\n' + s) print "Terminating all threads..." self.stop() assert(self.threadPool.getTaskLeft() == 0) self.topic_info_file.close() self.comment_info_file.close() print "Main Crawling procedure finished!" print "Start to save result..." 
#self._saveCommentList() #self.saveComment2file() log.info("Processing done with group: %s" % (self.groupID)) def stop(self): self.isCrawling = False self.threadPool.stopThreads() self.saveThread.stopThreads() def _saveCommentList(self): """将抽取的结果存储在文件中,包括存储topic内容和评论内容 Note: 这次是将存储过程放在主线程,将会阻塞抓取过程 NOTE: 此函数已经不再使用 """ # 如果不存在目录,则创建它 path = "data/" + self.groupID + "/" if not os.path.exists(path): os.mkdir(path) for topic_id in self.topicIDList: topic = self.topicDict[topic_id] path = "data/" + self.groupID + "/" + topic_id + ".txt" f = codecs.open(path, "w", "utf-8", errors='replace') f.write(topic.__repr__()) f.close() # save the failed hrefs f = open("data/"+self.groupID+"/failed.txt", "w") for href in self.failed: f.write(href + "\n") f.close() # write comment structures path = "structure/" + self.groupID + "/" if not os.path.exists(path): os.mkdir(path) for topic_id in self.topicDict: atopic = self.topicDict[topic_id] path = "structure/" + self.groupID + "/" + topic_id + ".txt" f = codecs.open(path, "w", "utf-8", errors='replace') # 每一行:评论id,评论用户id,(引用评论id,引用评论的用户id) for comment in atopic.comment_list: f.write(comment.cid + " " + comment.user_id) if comment.quote is not None: f.write(" " + comment.quote.cid + " " + comment.quote.user_id) f.write("\n") f.close() def saveComment2file(self): """ 直接将抓取结果存入文件中 """ ftopic = open(self.topic_info_path, 'w') fcomment = open(self.comment_info_path , 'w') for topic_id in self.topicDict: topic = self.topicDict[topic_id] s = topic.getSimpleString(delimiter = '[=]') ftopic.write(s + '\n[*ROWEND*]\n') for comment in topic.comment_list: cs = comment.getSimpleString(delimiter = '[=]') fcomment.write(cs + '\n[*ROWEND*]\n') ftopic.close() fcomment.close() def _save_handler(self, comment_list, topic): """ 将topic信息和comemnt信息保存到文件中 """ # 先保存comment_list id # 判断是否是第一次保存该topic if topic != None: # 如果是第一次保存,则需要保存topic的基本信息 s = topic.getSimpleString('[=]') self.topic_info_file.write(s + '\n[*ROWEND*]\n') # 保存comment信息 for comment in comment_list: s = comment.getSimpleString('[=]') self.comment_info_file.write(s + '\n[*ROWEND*]\n') # 保证已经写入到磁盘上,这样可以随时终止 self.topic_info_file.flush() os.fsync(self.topic_info_file) self.comment_info_file.flush() os.fsync(self.comment_info_file) def getAlreadyVisitedNum(self): #visitedGroups保存已经分配给taskQueue的链接,有可能链接还在处理中。 #因此真实的已访问链接数为visitedGroups数减去待访问的链接数 if len(self.visitedHref) == 0: return 0 else: return len(self.visitedHref) - self.threadPool.getTaskLeft() def _getFutureVisit(self): """根据当前的访问情况,获取下一个要访问的网页 """ for topic_id in self.topicDict: if topic_id in self.finished: continue topic = self.topicDict[topic_id] if topic is None: continue if topic.max_comment_page <= 0: # 还未处理该topic的首页 continue elif topic.max_comment_page == 1: # 该topic只有首页有评论 continue else: # 该topic有多页评论 next_start = self.nextPage[topic_id] url = "http://www.douban.com/group/topic/" + topic_id + "/?start=" + str(next_start * self.COMMENTS_PER_PAGE) if next_start <= topic.max_comment_page-1: self.nextPage[topic_id] = next_start + 1 return url else: continue return None def _taskHandler(self, url): """ 根据指定的url,抓取网页,并进行相应的访问控制 """ print "Visiting : " + url webPage = WebPage(url) # 抓取页面内容 flag = webPage.fetch() match_obj = RETopic.match(url) match_obj2 = REComment.match(url) if flag: if match_obj is not None: topic_id = match_obj.group(1) topic = Topic(topic_id, self.groupID) comment_list = topic.parse(webPage, True) # First page parsing self.topicDict[topic_id] = topic # 保存到文件 self.saveThread.putTask(self._save_handler, comment_list, topic = topic) # 如果 elif 
match_obj2 is not None: topic_id = match_obj2.group(1) start = int(match_obj2.group(2)) # 抽取非第一页的评论数据 if topic_id in self.topicDict: topic = self.topicDict[topic_id] if topic is None: log.error('未知程序错误:该topic已经抓取结束,已释放相关内存,topic id:%s' % topic_id) return False else: log.error('未知程序错误:在topicDict字典中找不到topic id: %s' % topic_id) self.failed.add(topic_id) self.finished.add(topic_id) return False comment_list = topic.parse(webPage, False) # non-firstpage parsing # 保存到文件 self.saveThread.putTask(self._save_handler, comment_list, topic = None) else: #pdb.set_trace() log.info('Topic链接格式错误:%s in Group: %s.' % (url, self.groupID)) # 判断抓取是否结束,如果结束,则释放dict内存 # 这个很重要,因为随着topic数量增多,内存会占很多 if topic.isComplete(): self.topicDict[topic_id] = None self.finished.add(topic_id) log.info('Topic: %s 抓取结束。' % topic_id) self.visitedHref.add(url) return True else: # 处理抓取失败的网页集合 # 只要一个网页抓取失败,则加入到finished if match_obj is not None: # 讨论贴的第一页就没有抓到,则将其列入finished名单中 topic_id = match_obj.group(1) elif match_obj2 is not None: topic_id = match_obj2.group(1) start = int(match_obj2.group(2)) else: log.info('Topic链接格式错误:%s in Group: %s.' % (url, self.groupID)) # 添加抓取失败的topic id和标记抓取结束的topic self.failed.add(topic_id) self.finished.add(topic_id) # 有可能已经记录了一些某些topic的信息 self.visitedHref.add(url) return False def _getAllHrefsFromPage(self, url, pageSource): '''解析html源码,获取页面所有链接。返回链接列表''' hrefs = [] soup = BeautifulSoup(pageSource) results = soup.find_all('a',href=True) for a in results: #必须将链接encode为utf8, 因为中文文件链接如 http://aa.com/文件.pdf #在bs4中不会被自动url编码,从而导致encodeException href = a.get('href').encode('utf8') if not href.startswith('http'): href = urljoin(url, href)#处理相对链接的问题 hrefs.append(href) return hrefs def _isHttpOrHttpsProtocol(self, href): protocal = urlparse(href).scheme if protocal == 'http' or protocal == 'https': return True return False
class Crawler(object): def __init__(self, args, startURLs): #指定网页深度 self.depth = args.depth #标注初始爬虫深度,从1开始 self.currentDepth = 1 #指定关键词,使用console的默认编码来解码 #self.keyword = args.keyword.decode(getdefaultlocale()[1]) #数据库 self.database = Database(args.dbFile) # store group ids to fils, using UTF-8 self.groupfile = codecs.open("GroupID.txt", "w", "UTF-8") #线程池,指定线程数 self.threadPool = ThreadPool(args.threadNum) #已访问的小组id self.visitedGroups = set() #待访问的小组id self.unvisitedGroups = deque() # 所有的Group信息 self.groupInfo = [] self.lock = Lock() #线程锁 #标记爬虫是否开始执行任务 self.isCrawling = False # 添加尚未访问的小组首页 for url in startURLs: match_obj = REGroup.match(url) print "Add start urls:", url assert(match_obj != None) self.unvisitedGroups.append(match_obj.group(1)) # 一分钟内允许的最大访问次数 self.MAX_VISITS_PER_MINUTE = 10 # 当前周期内已经访问的网页数量 self.currentPeriodVisits = 0 # 将一分钟当作一个访问周期,记录当前周期的开始时间 self.periodStart = time.time() # 使用当前时间初始化 def start(self): print '\nStart Crawling\n' if not self._isDatabaseAvaliable(): print 'Error: Unable to open database file.\n' else: self.isCrawling = True self.threadPool.startThreads() self.periodStart = time.time() # 当前周期开始 # 按照depth来抓取网页 while self.currentDepth < self.depth+1: #分配任务,线程池并发下载当前深度的所有页面(该操作不阻塞) self._assignCurrentDepthTasks () #等待当前线程池完成所有任务,当池内的所有任务完成时,即代表爬完了一个网页深度 #self.threadPool.taskJoin()可代替以下操作,可无法Ctrl-C Interupt while self.threadPool.getTaskLeft() > 0: print "Task left: ", self.threadPool.getTaskLeft() time.sleep(3) print 'Depth %d Finish. Totally visited %d links. \n' % ( self.currentDepth, len(self.visitedGroups)) log.info('Depth %d Finish. Total visited Links: %d\n' % ( self.currentDepth, len(self.visitedGroups))) self.currentDepth += 1 self.stop() assert(self.threadPool.getTaskLeft() == 0) print "Main Crawling procedure finished!" def stop(self): self.isCrawling = False self.threadPool.stopThreads() # save group ids to file for group_id in self.visitedGroups: self.groupfile.write(group_id + "\n") self.groupfile.close() self.database.close() def getAlreadyVisitedNum(self): #visitedGroups保存已经分配给taskQueue的链接,有可能链接还在处理中。 #因此真实的已访问链接数为visitedGroups数减去待访问的链接数 if len(self.visitedGroups) == 0: return 0 else: return len(self.visitedGroups) - self.threadPool.getTaskLeft() def _assignCurrentDepthTasks(self): """取出一个线程,并为这个线程分配任务,即抓取网页,并进行相应的访问控制 """ # 判断当前周期内访问的网页数目是否大于最大数目 if self.currentPeriodVisits > self.MAX_VISITS_PER_MINUTE - 1: # 等待所有的网页处理完毕 while self.threadPool.getTaskLeft() > 0: print "Waiting period ends..." 
time.sleep(1) timeNow = time.time() seconds = timeNow - self.periodStart if seconds < 60: # 如果当前还没有过一分钟,则sleep time.sleep(int(seconds + 3)) self.periodStart = time.time() # 重新设置开始时间 self.currentPeriodVisits = 0 # 从未访问的列表中抽出,并为其分配thread while len(self.unvisitedGroups) > 0: group_id = self.unvisitedGroups.popleft() #向任务队列分配任务 url = "http://www.douban.com/group/" + group_id + "/" self.threadPool.putTask(self._taskHandler, url) # 添加已经访问过的小组id self.visitedGroups.add(group_id) def _taskHandler(self, url): """ 根据指定的url,抓取网页 """ print "Visiting : " + url webPage = WebPage(url) # 抓取页面内容 flag = webPage.fetch() if flag: self.lock.acquire() #锁住该变量,保证操作的原子性 self.currentPeriodVisits += 1 self.lock.release() self._saveTaskResults(webPage) self._addUnvisitedGroups(webPage) return True # if page reading fails return False def _saveTaskResults(self, webPage): """将小组信息写入数据库 """ url, pageSource = webPage.getDatas() # 产生一个group对象 dbgroup = Group(url, pageSource) # 写入数据库 self.database.saveGroupInfo(dbgroup) def _addUnvisitedGroups(self, webPage): '''添加未访问的链接,并过滤掉非小组主页的链接。将有效的url放进UnvisitedGroups列表''' #对链接进行过滤:1.只获取http或https网页;2.保证每个链接只访问一次 url, pageSource = webPage.getDatas() hrefs = self._getAllHrefsFromPage(url, pageSource) for href in hrefs: #print "URLs in page: ", href match_obj = REGroup.match(href) # 只有满足小组主页链接格式的链接才会被处理 if self._isHttpOrHttpsProtocol(href) and (match_obj is not None): #pdb.set_trace() group_id = match_obj.group(1) #print "Group link: " + href if not self._isGroupRepeated(group_id): # 将小组id放入待访问的小组列表中去 print "Add group id:", group_id self.unvisitedGroups.append(group_id) def _getAllHrefsFromPage(self, url, pageSource): '''解析html源码,获取页面所有链接。返回链接列表''' hrefs = [] soup = BeautifulSoup(pageSource) results = soup.find_all('a',href=True) for a in results: #必须将链接encode为utf8, 因为中文文件链接如 http://aa.com/文件.pdf #在bs4中不会被自动url编码,从而导致encodeException href = a.get('href').encode('utf8') if not href.startswith('http'): href = urljoin(url, href)#处理相对链接的问题 hrefs.append(href) return hrefs def _isHttpOrHttpsProtocol(self, href): protocal = urlparse(href).scheme if protocal == 'http' or protocal == 'https': return True return False def _isGroupRepeated(self, group_id): if (group_id in self.visitedGroups) or (group_id in self.unvisitedGroups): return True return False def _isDatabaseAvaliable(self): if self.database.isConn(): return True return False def selfTesting(self, args): url = 'http://www.douban.com/group/insidestory/' print '\nVisiting http://www.douban.com/group/insidestory/' #测试网络,能否顺利获取百度源码 pageSource = WebPage(url).fetch() if pageSource == None: print 'Please check your network and make sure it\'s connected.\n' #数据库测试 elif not self._isDatabaseAvaliable(): print 'Please make sure you have the permission to save data: %s\n' % args.dbFile #保存数据 else: #self._saveTaskResults(url, pageSource) print 'Create logfile and database Successfully.' print 'Already save Baidu.com, Please check the database record.' print 'Seems No Problem!\n'
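
# The group crawler above keys everything on REGroup, which is defined outside
# this section.  Given the group URLs it constructs
# ("http://www.douban.com/group/<group_id>/") and that group(1) is used as the
# group id, a plausible definition is (sketch only):
import re

REGroup = re.compile(r'^http://www\.douban\.com/group/([^/?]+)/$')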
class Crawler(object): def __init__(self, args=Strategy()): self.url = args.url self.max_depth = args.max_depth #指定网页深度 self.max_count = args.max_count #爬行最大数量 self.concurrency = args.concurrency #线程数 self.timeout = args.timeout #超时 self.cookies = args.cookies #cookies self.ssl_verify = args.ssl_verify #ssl self.same_host = args.same_host #是否只抓取相同host的链接 self.same_domain = args.same_domain #是否只抓取相同domain的链接 self.currentDepth = 1 #标注初始爬虫深度,从1开始 self.keyword = args.keyword #指定关键词,使用console的默认编码来解码 self.threadPool = ThreadPool(args.concurrency) #线程池,指定线程数 self.visitedHrefs = set() #已访问的链接 self.unvisitedHrefs = deque() #待访问的链接 self.unvisitedHrefs.append(args.url)#添加首个待访问的链接 self.isCrawling = False #标记爬虫是否开始执行任务 self.file = BASEDIR + '/cache/crawler/' + genFilename(self.url) + '.txt' print self.file print 'args.url=\t',args.url ################# #此句有问题 self.database = Database(args.dbFile) #数据库 # print 'hehe' self.lock = Lock() def start(self): print '\nStart Crawling\n' if not self._isDatabaseAvaliable(): print 'Error: Unable to open database file.\n' else: pass if True: self.isCrawling = True self.threadPool.startThreads() while self.currentDepth <= self.max_depth and len(self.visitedHrefs) <= self.max_count: #分配任务,线程池并发下载当前深度的所有页面(该操作不阻塞) self._assignCurrentDepthTasks () #等待当前线程池完成所有任务,当池内的所有任务完成时,即代表爬完了一个网页深度 #self.threadPool.taskJoin()可代替以下操作,可无法Ctrl-C Interupt counter = 0 while self.threadPool.getTaskLeft() and counter < 600: # print '>>taskleft:\t',self.threadPool.getTaskLeft() # print self.threadPool.taskQueue.qsize() # print self.threadPool.resultQueue.qsize() # print self.threadPool.running time.sleep(1) counter += 1 # self.threadPool.taskJoin() print 'Depth %d Finish. Totally visited %d links. \n' % ( self.currentDepth, len(self.visitedHrefs)) log.info('Depth %d Finish. Total visited Links: %d\n' % ( self.currentDepth, len(self.visitedHrefs))) self.currentDepth += 1 self.stop() def stop(self): self.isCrawling = False self.threadPool.stopThreads() # self.database.close() def saveAllHrefsToFile(self,nonehtml=True): try: cf = CrawlerFile(url=self.url) contentlist = [] hrefs = [i for i in self.visitedHrefs] + [j for j in self.unvisitedHrefs] for href in hrefs: if href.endswith('.html') and nonehtml: continue contentlist.append(href) cf.saveSection('Hrefs',contentlist,coverfile=True) # fp = open(self.file,'w') # fp.write('[Hrefs]'+os.linesep) # hrefs = [i for i in self.visitedHrefs] + [j for j in self.unvisitedHrefs] # rethrefs = [] # print 'Totally ',len(hrefs), ' hrefs' # for href in hrefs: # if href.endswith('.html'): # continue # rethrefs.append(href) # fp.write(href + os.linesep) # print href # print 'Totally ',len(rethrefs), ' aviable hrefs' # fp.close() except: pass def _getCrawlerPaths(self,url): ''' ''' try: paths = [] baseulp = urlparse(url) cf = CrawlerFile(url=url) urls = cf.getSection('Hrefs') #print urls for eachline in urls: eachline = eachline.replace('\r','') eachline = eachline.replace('\n','') #print eachline eachulp = urlparse(eachline) if baseulp.scheme == eachulp.scheme and baseulp.netloc == eachulp.netloc: fullpath = eachulp.path if fullpath.find('.') == -1 and fullpath.endswith('/') == False: fullpath += '/' pos = 0 while True: pos = fullpath.find('/',pos) if pos == -1: break tmppth = eachulp.scheme + '://' + eachulp.netloc + eachulp.path[:pos] if tmppth.endswith('/'): #tmppth = tmppth[:-1] continue if tmppth not in paths: paths.append(tmppth) pos +=1 return paths except Exception,e: print 'Exception:\t',e return [url]
class Crawler(object):

    def __init__(self, args):
        # Maximum crawl depth.
        self.depth = args.depth
        # Current crawl depth, starting from 1.
        self.currentDepth = 1
        # Keyword, decoded with the console's default encoding.
        self.keyword = args.keyword.decode(getdefaultlocale()[1])
        # Database.
        self.database = Database(args.dbFile)
        # Thread pool with the given number of threads.
        self.threadPool = ThreadPool(args.threadNum)
        # Links already visited.
        self.visitedHrefs = set()
        # Links waiting to be visited.
        self.unvisitedHrefs = deque()
        # Seed the queue with the start url.
        self.unvisitedHrefs.append(args.url)
        # Whether the crawler has started.
        self.isCrawling = False

    def start(self):
        print ('\nStart Crawling\n')
        if not self._isDatabaseAvaliable():
            print ('Error: Unable to open database file.\n')
        else:
            self.isCrawling = True
            self.threadPool.startThreads()
            while self.currentDepth < self.depth + 1:
                # Dispatch tasks: the thread pool downloads all pages of the
                # current depth concurrently (this call does not block).
                self._assignCurrentDepthTasks()
                # Wait for the pool to finish all tasks; when the pool is drained,
                # one crawl depth is complete. self.threadPool.taskJoin() would
                # also work, but it cannot be interrupted with Ctrl-C.
                while self.threadPool.getTaskLeft():
                    time.sleep(8)
                print ('Depth %d Finish. Totally visited %d links. \n' % (
                    self.currentDepth, len(self.visitedHrefs)))
                log.info('Depth %d Finish. Total visited Links: %d\n' % (
                    self.currentDepth, len(self.visitedHrefs)))
                self.currentDepth += 1
            self.stop()

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        self.database.close()

    def getAlreadyVisitedNum(self):
        # visitedHrefs holds links that have already been handed to the task
        # queue, some of which may still be in flight, so the true number of
        # visited links is len(visitedHrefs) minus the number of pending tasks.
        return len(self.visitedHrefs) - self.threadPool.getTaskLeft()

    def _assignCurrentDepthTasks(self):
        while self.unvisitedHrefs:
            url = self.unvisitedHrefs.popleft()
            # Put the task into the task queue.
            self.threadPool.putTask(self._taskHandler, url)
            # Mark the link as visited (or about to be visited) so the same
            # link is not crawled twice.
            self.visitedHrefs.add(url)

    def _taskHandler(self, url):
        # Fetching and saving the page are both blocking operations, so they
        # run inside a worker thread.
        webPage = WebPage(url)
        if webPage.fetch():
            self._saveTaskResults(webPage)
            self._addUnvisitedHrefs(webPage)

    def _saveTaskResults(self, webPage):
        url, pageSource = webPage.getDatas()
        try:
            if self.keyword:
                # A case-insensitive regex search is (probably) faster than
                # lower()-ing the page and then searching.
                if re.search(self.keyword, pageSource, re.I):
                    self.database.saveData(url, pageSource, self.keyword)
            else:
                self.database.saveData(url, pageSource)
        except Exception, e:
            log.error(' URL: %s ' % url + traceback.format_exc())

    def _addUnvisitedHrefs(self, webPage):
        '''Collect unvisited links and append the valid ones to unvisitedHrefs.'''
        # Filtering rules: 1. only http/https pages; 2. each link is visited only once.
        url, pageSource = webPage.getDatas()
        hrefs = self._getAllHrefsFromPage(url, pageSource)
        for href in hrefs:
            if self._isHttpOrHttpsProtocol(href):
                if not self._isHrefRepeated(href):
                    self.unvisitedHrefs.append(href)

    def _getAllHrefsFromPage(self, url, pageSource):
        '''Parse the html source and return a list of all links on the page.'''
        hrefs = []
        soup = BeautifulSoup(pageSource)
        results = soup.find_all('a', href=True)
        for a in results:
            # The link must be encoded as utf8: bs4 does not url-encode
            # non-ASCII links such as http://aa.com/文件.pdf, which would
            # otherwise raise an encoding exception later.
            href = a.get('href').encode('utf8')
            if not href.startswith('http'):
                href = urljoin(url, href)   # resolve relative links
            hrefs.append(href)
        return hrefs

    def _isHttpOrHttpsProtocol(self, href):
        protocol = urlparse(href).scheme
        if protocol == 'http' or protocol == 'https':
            return True
        return False

    def _isHrefRepeated(self, href):
        if href in self.visitedHrefs or href in self.unvisitedHrefs:
            return True
        return False

    def _isDatabaseAvaliable(self):
        if self.database.isConn():
            return True
        return False

    def selfTesting(self, args):
        url = 'http://www.baidu.com/'
        print ('\nVisiting www.baidu.com')
        # Network test: can we fetch the page at all?
        webPage = WebPage(url)
        pageSource = webPage.fetch()
        if pageSource == None:
            print ('Please check your network and make sure it\'s connected.\n')
        # Database test.
        elif not self._isDatabaseAvaliable():
            print ('Please make sure you have the permission to save data: %s\n' % args.dbFile)
        # Save the test page.
        else:
            self._saveTaskResults(webPage)
            print ('Created logfile and database successfully.')
            print ('Already saved Baidu.com, please check the database record.')
            print ('Seems No Problem!\n')
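All of the crawlers in this file assume a ThreadPool exposing startThreads / putTask / getTaskLeft / getTaskResult / stopThreads. The project's real ThreadPool is defined elsewhere; the minimal sketch below only illustrates that interface under simplified assumptions (it tracks unfinished tasks with a plain counter and does not reproduce the real pool's bookkeeping of running threads and uncollected results).

from Queue import Queue, Empty   # Python 2; use 'queue' on Python 3
from threading import Thread, Lock

class SimpleThreadPool(object):
    """Minimal pool exposing the same method names the crawlers call."""

    def __init__(self, thread_num):
        self.thread_num = thread_num
        self.taskQueue = Queue()     # (func, args, kwargs) tuples
        self.resultQueue = Queue()   # return values of finished tasks
        self.pending = 0             # submitted but not yet finished tasks
        self.lock = Lock()
        self.alive = True

    def startThreads(self):
        for _ in range(self.thread_num):
            t = Thread(target=self._worker)
            t.setDaemon(True)
            t.start()

    def stopThreads(self):
        self.alive = False

    def putTask(self, func, *args, **kwargs):
        with self.lock:
            self.pending += 1
        self.taskQueue.put((func, args, kwargs))

    def getTaskLeft(self):
        # Number of tasks submitted but not yet finished.
        return self.pending

    def getTaskResult(self):
        # Blocks until some task has produced a result.
        return self.resultQueue.get()

    def _worker(self):
        while self.alive:
            try:
                func, args, kwargs = self.taskQueue.get(timeout=1)
            except Empty:
                continue
            self.resultQueue.put(func(*args, **kwargs))
            with self.lock:
                self.pending -= 1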
class TopicCrawler(object):

    def __init__(self, group_id, thread_num, group_info_path, topic_list_path, max_topics_num=1000):
        """
        `group_id`          id of the group to crawl
        `thread_num`        number of crawling threads
        `group_info_path`   path of the file storing the group's own information
        `topic_list_path`   path of the file storing the list of all topic ids
        """
        # Thread pool with the given number of threads.
        self.thread_pool = ThreadPool(thread_num)
        # Single thread for saving topics.
        self.save_thread = ThreadPool(1)
        # Single thread for writing to the database.
        # self.DBThread = ThreadPool(1)

        # Paths of the group information and topic list files.
        self.group_info_path = group_info_path
        self.topic_list_path = topic_list_path

        # Pages already visited: Group id ==> True or False
        self.visited_href = set()
        # Group discussion pages waiting to be visited.
        self.unvisited_href = deque()
        # Pages that failed to load.
        self.failed_href = set()

        self.lock = Lock()         # thread lock

        self.group_id = group_id
        self.group_info = None     # models.Group

        # Crawling stops in one of two cases: 1) the maximum number of topics
        # has been reached; 2) every topic has been crawled.
        # Only topic ids are stored.
        self.topic_list = list()

        self.is_crawling = False

        # self.database = Database("DoubanGroup.db")

        # Maximum number of topics crawled per group.
        self.MAX_TOPICS_NUM = max_topics_num
        # self.MAX_TOPICS_NUM = float('inf')
        # Maximum number of topics shown per page; a page does not always seem
        # to show exactly 25 topics.
        # self.MAX_TOPICS_PER_PAGE = 25

    def start(self):
        print '\nStart Crawling topic list...\n'
        self.is_crawling = True
        self.thread_pool.startThreads()
        self.save_thread.startThreads()

        # Open the output files.
        self.group_info_file = codecs.open(self.group_info_path, 'w', 'utf-8')
        self.topic_list_file = codecs.open(self.topic_list_path, 'w', 'utf-8')

        url = "http://www.douban.com/group/" + self.group_id + "/"
        print "Add start url:", url
        self.unvisited_href.append(url)
        url = "http://www.douban.com/group/" + self.group_id + "/discussion?start=0"
        print "Add start url:", url
        self.unvisited_href.append(url)

        # Dispatch tasks: the thread pool downloads the queued pages
        # concurrently (this call does not block).
        self._assignInitTask()
        # Wait for the pool to finish all tasks before moving on to the next
        # group. self.thread_pool.taskJoin() would also work, but it cannot be
        # interrupted with Ctrl-C.
        while self.thread_pool.getTaskLeft() > 0:
            # print "Task left: ", self.thread_pool.getTaskLeft()
            time.sleep(3)

        # Wait for the saving thread to finish writing the results.
        while self.save_thread.getTaskLeft() > 0:
            print 'Waiting for saving thread. Tasks left: %d' % self.save_thread.getTaskLeft()
            time.sleep(3)

        print "Storing crawled topic list for: " + self.group_id
        print "Save to files..."
        # self._saveTopicList()
        print "Processing done with group: " + self.group_id
        log.info("Topic list crawling done with group %s.", self.group_id)

        self.stop()
        assert(self.thread_pool.getTaskLeft() == 0)

        # Close the output files.
        self.group_info_file.close()
        self.topic_list_file.close()
        print "Main Crawling procedure finished!"
    def stop(self):
        self.is_crawling = False
        self.thread_pool.stopThreads()
        self.save_thread.stopThreads()

    def _assignInitTask(self):
        """Hand every queued page to a worker thread."""
        while len(self.unvisited_href) > 0:
            # Take one url from the unvisited queue and give it to a thread.
            url = self.unvisited_href.popleft()
            self.thread_pool.putTask(self._taskHandler, url)
            # Mark the url as visited.
            self.visited_href.add(url)

    def _taskHandler(self, url):
        """Fetch the page at the given url and dispatch it to the right parser."""
        print "Visiting : " + url
        webPage = WebPage(url)
        # Fetch the page content.
        flag = webPage.fetch()
        if flag:
            url, pageSource = webPage.getDatas()
            # The group home page: extract the sticky (pinned) topics.
            match_obj = REGroup.match(url)
            if match_obj is not None:
                group_id = match_obj.group(1)
                # Add the sticky topics.
                self._addStickTopic(webPage)
                return True
            # A regular discussion list page: extract the normal topics.
            match_obj = REDiscussion.match(url)
            if match_obj is not None:
                group_id = match_obj.group(1)
                start = int(match_obj.group(2))
                self._addTopicLink(webPage, start)
                return True
            log.error("Unexpected URL format while crawling the discussion list. Group ID: %s, URL: %s" % (self.group_id, url))
        # if page reading fails
        self.failed_href.add(url)
        return False

    def _addStickTopic(self, webPage):
        """Parse the group home page and record its sticky topics."""
        #pdb.set_trace()
        group = Group(self.group_id)
        group.parse(webPage)
        self.group_info = group
        self.save_thread.putTask(self._saveGroupHandler, group)

    def _addTopicLink(self, webPage, start):
        '''Collect every topic link on the page into the topic list and, for
        the first page, queue all remaining discussion pages for future visits.'''
        # Filtering rules: 1. only http/https pages; 2. each link is visited only once.
        #pdb.set_trace()
        url, pageSource = webPage.getDatas()
        hrefs = self._getAllHrefsFromPage(url, pageSource)
        # Keep only the valid topic links.
        topic_list = []
        for href in hrefs:
            # Only links that match the group topic format are processed.
            match_obj = RETopic.match(href)
            if self._isHttpOrHttpsProtocol(href) and match_obj is not None:
                topic_list.append(match_obj.group(1))
        for topic in topic_list:
            #print "Add group id:", self.group_id, "with topic link: ", href
            self.topic_list.append(topic)
        # Save the topic ids collected from this page.
        self.save_thread.putTask(self._saveTopicHandler, topic_list)
        # For the first page, queue every discussion page that will need to be visited.
        if start == 0:
            print "Adding future visits for Group: " + self.group_id
            self._addFutureVisit(pageSource)

    def _saveTopicHandler(self, topic_list):
        """Append the topic ids extracted from one page to the topic list file."""
        for tid in topic_list:
            self.topic_list_file.write(tid + '\n')
        self.topic_list_file.flush()
        os.fsync(self.topic_list_file.fileno())

    def _saveGroupHandler(self, group):
        """Save the group's basic information, e.g. its description and creation date.
        `group` models.Group
        """
        #print 'In saving thread'
        # Write the group's basic information and its sticky topic ids.
        self.group_info_file.write(group.getSimpleString('[=]'))
        self.group_info_file.flush()
        os.fsync(self.group_info_file.fileno())

    def _addFutureVisit(self, pageSource):
        """Parse the first discussion list page and queue all remaining pages."""
        #pdb.set_trace()
        if not isinstance(pageSource, unicode):
            # Assume the page is UTF-8 encoded by default.
            page = etree.HTML(pageSource.decode('utf-8'))
        else:
            page = etree.HTML(pageSource)
        # This relies on the observation that every list page shows the total
        # number of pages in its paginator; once the total is known, all future
        # pages can be queued at once.
        paginator = page.xpath(u"//div[@class='paginator']/a")
        last_page = int(paginator[-1].text.strip())
        for i in range(1, last_page):
            # Cap the number of topics added to the list.
            if i * 25 >= self.MAX_TOPICS_NUM:
                break
            url = "http://www.douban.com/group/" + self.group_id + "/discussion?start=" + str(i * 25)
            # Add the task to the thread pool, all in one go.
            self.thread_pool.putTask(self._taskHandler, url)
            # Mark the url as visited.
            self.visited_href.add(url)

    def _getAllHrefsFromPage(self, url, pageSource):
        '''Parse the html source and return a list of all links on the page.'''
        hrefs = []
        soup = BeautifulSoup(pageSource)
        results = soup.find_all('a', href=True)
        for a in results:
            # The link must be encoded as utf8: bs4 does not url-encode
            # non-ASCII links such as http://aa.com/文件.pdf, which would
            # otherwise raise an encoding exception later.
            href = a.get('href').encode('utf8')
            if not href.startswith('http'):
                href = urljoin(url, href)   # resolve relative links
            hrefs.append(href)
        return hrefs

    def _isHttpOrHttpsProtocol(self, href):
        protocol = urlparse(href).scheme
        if protocol == 'http' or protocol == 'https':
            return True
        return False

    def _getAlreadyVisitedNum(self):
        # visited_href holds links that have already been handed to the task
        # queue, some of which may still be in flight, so the true number of
        # visited links is len(visited_href) minus the number of pending tasks.
        if len(self.visited_href) == 0:
            return 0
        else:
            return len(self.visited_href) - self.thread_pool.getTaskLeft()

    def _saveTopicList(self):
        """Write the extracted results to files.
        Note: this version runs in the main thread and therefore blocks crawling.
        """
        group_id = self.group_id
        this_group = self.group_info
        print "For group %s: number of sticky posts: %d, number of regular posts: %d, total topics: %d." % \
            (group_id, len(this_group.stick_topic_list), len(self.topic_list),
             len(this_group.stick_topic_list) + len(self.topic_list))

        # Record the pages that failed to load.
        log.info('Pages that failed to load:')
        for href in self.failed_href:
            log.info(href)

        # Save the group's own information.
        f = open(self.group_info_path, "w")
        f.write(this_group.__repr__())
        f.close()

        # Save the topic ids: sticky topics first, then regular topics.
        f = open(self.topic_list_path, 'w')
        for tid in this_group.stick_topic_list:
            f.write(tid + "\n")
        f.write("\n")
        for tid in self.topic_list:
            f.write(tid + "\n")
        f.close()

        self.topic_list = list()
        self.failed_href = set()
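A minimal sketch of how TopicCrawler might be driven as a script, based only on its constructor signature above; the group id and file paths here are illustrative, not taken from the original project.

if __name__ == '__main__':
    gid = 'insidestory'   # example group id
    crawler = TopicCrawler(group_id=gid,
                           thread_num=5,
                           group_info_path='tables/GroupInfo-%s.txt' % gid,
                           topic_list_path='tables/TopicList-%s.txt' % gid,
                           max_topics_num=1000)
    # Crawl the group's discussion list and write the topic ids to file.
    crawler.start()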
class Crawler(object):

    def __init__(self, args):
        # Maximum crawl depth.
        self.depth = args.depth
        # Current crawl depth, starting from 1.
        self.currentDepth = 1
        # Keyword, decoded with the console's default encoding.
        self.keyword = args.keyword.decode(getdefaultlocale()[1])
        # Database.
        self.database = Database(args.dbFile)
        # Thread pool with the given number of threads.
        self.threadPool = ThreadPool(args.threadNum)
        # Links already visited.
        self.visitedHrefs = set()
        # Links waiting to be visited.
        self.unvisitedHrefs = deque()
        # The queue is seeded from the domain seeds file instead of a single start url.
        # self.unvisitedHrefs.append(args.url)
        # Whether the crawler has started.
        self.isCrawling = False
        self.domainPattern = re.compile(r"^([0-9a-zA-Z][0-9a-zA-Z-]{0,62}\.)+([0-9a-zA-Z][0-9a-zA-Z-]{0,62})\.?$")
        self.maxDomainSeeds = args.maxDomainSeeds
        self._initDomainSeedsList(args.domainSeeds)

    def _initDomainSeedsList(self, domainSeedsFile):
        fp = open(domainSeedsFile, 'r+')
        urlList = fp.readlines()
        for url in urlList:
            formattedUrl = self._formatUrl(url)
            if len(formattedUrl) > 0 and len(self.unvisitedHrefs) <= self.maxDomainSeeds:
                self.unvisitedHrefs.append(formattedUrl)
        fp.close()
        print 'We have got %d domain seeds.' % len(self.unvisitedHrefs)

    def _formatUrl(self, rawValue):
        rawValueStr = rawValue.strip().strip('\n')
        if len(rawValueStr) <= 0:
            return ''
        if not self.domainPattern.match(rawValueStr):
            return ''
        if not rawValueStr.startswith('http'):
            value = 'http://' + rawValueStr
        else:
            value = rawValueStr
        return value

    def start(self):
        print '\nStart Crawling\n'
        if not self._isDatabaseAvaliable():
            print 'Error: Unable to open database file.\n'
        else:
            self.isCrawling = True
            self.threadPool.startThreads()
            while self.currentDepth < self.depth + 1:
                # Dispatch tasks: the thread pool downloads all pages of the
                # current depth concurrently (this call does not block).
                self._assignCurrentDepthTasks()
                # Wait for the pool to finish all tasks; when the pool is drained,
                # one crawl depth is complete. self.threadPool.taskJoin() would
                # also work, but it cannot be interrupted with Ctrl-C.
                while self.threadPool.getTaskLeft():
                    time.sleep(8)
                print 'Depth %d Finish. Totally visited %d links. \n' % (
                    self.currentDepth, len(self.visitedHrefs))
                log.info('Depth %d Finish. Total visited Links: %d\n' % (
                    self.currentDepth, len(self.visitedHrefs)))
                self.currentDepth += 1
            self.stop()

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        self.database.close()

    def getAlreadyVisitedNum(self):
        # visitedHrefs holds links that have already been handed to the task
        # queue, some of which may still be in flight, so the true number of
        # visited links is len(visitedHrefs) minus the number of pending tasks.
        return len(self.visitedHrefs) - self.threadPool.getTaskLeft()

    def _assignCurrentDepthTasks(self):
        mylock.acquire()
        copiedUnvisitedHrefs = deque()
        while self.unvisitedHrefs:
            copiedUnvisitedHrefs.append(self.unvisitedHrefs.popleft())
        mylock.release()
        while copiedUnvisitedHrefs:
            url = copiedUnvisitedHrefs.popleft()
            # Mark the link as visited (or about to be visited) so the same
            # link is not crawled twice.
            self.visitedHrefs.add(url)
            # Put the task into the task queue.
            self.threadPool.putTask(self._taskHandler, url)

    def _taskHandler(self, url):
        # Fetching and saving the page are both blocking operations, so they
        # run inside a worker thread.
        webPage = WebPage(url)
        retry = 1
        if webPage.fetch(retry):
            print 'Visited URL : %s ' % url
            self._saveTaskResults(webPage)
            self._addUnvisitedHrefs(webPage)

    def _saveTaskResults(self, webPage):
        url, pageSource = webPage.getDatas()
        soup = BeautifulSoup(pageSource)
        image_tags = soup.find_all('img', src=True)
        log.error('pageSource %s' % pageSource)
        for image_tag in image_tags:
            log.error('image_tag %s' % image_tag.contents)
            image_tag_parent = image_tag.find_parent('a')
            if image_tag_parent is not None and image_tag_parent.get('href'):
                targetURL = image_tag_parent.get('href').encode('utf8')
                if not targetURL.startswith('http'):
                    targetURL = urljoin(url, targetURL)   # resolve relative links
                adsURL = image_tag.get('src').encode('utf8')
                if not adsURL.startswith('http'):
                    adsURL = urljoin(url, adsURL)         # resolve relative links
                print "We got an ad"
                print "adsURL %s" % adsURL
                print "targetURL %s" % targetURL
                print "referURL %s" % url
                print get_tld(adsURL)
                print get_tld(targetURL)
                print get_tld(url)
                log.error("adsURL %s" % adsURL)
                log.error("targetURL %s" % targetURL)
                log.error("referURL %s" % url)
                try:
                    self.database.saveData(adsURL, targetURL, url)
                except Exception, e:
                    log.error(' URL: %s ' % url + traceback.format_exc())
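The ad-link extraction in _saveTaskResults above (an <img> wrapped in an <a>) can also be written as a standalone helper. The sketch below is illustrative: it returns (ad image URL, landing URL) pairs instead of writing to the database, the function name is invented, and the BeautifulSoup calls mirror the ones used in the class.

from urlparse import urljoin    # Python 2; urllib.parse on Python 3
from bs4 import BeautifulSoup

def extract_ad_links(page_url, page_source):
    """Return (ad_image_url, landing_url) pairs for every <img> wrapped in an <a>."""
    pairs = []
    soup = BeautifulSoup(page_source)
    for img in soup.find_all('img', src=True):
        link = img.find_parent('a')
        if link is None or not link.get('href'):
            continue
        target = link.get('href').encode('utf8')
        ad_src = img.get('src').encode('utf8')
        # Resolve relative URLs against the page that embeds them.
        if not target.startswith('http'):
            target = urljoin(page_url, target)
        if not ad_src.startswith('http'):
            ad_src = urljoin(page_url, ad_src)
        pairs.append((ad_src, target))
    return pairs

# Example: extract_ad_links(url, pageSource) on a fetched page would yield the
# same adsURL/targetURL pairs that _saveTaskResults prints and stores.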