def _taskHandler(self, url):
    # Fetch the page source first, then save it; both are highly blocking
    # operations, so they are handed off to a worker thread.
    # print 'url=\t', url
    webPage = WebPage(url)
    if webPage.fetch():
        self._saveTaskResults(webPage)
        self._addUnvisitedHrefs(webPage)
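# Note: every handler in this collection assumes a small WebPage helper with
# fetch() and getDatas(). Its implementation is not part of these snippets;
# the sketch below is a hypothetical minimal version (only the method names
# come from the calls above, everything else is assumed). Some variants
# instead treat the return value of fetch() as the page source rather than
# a success flag.
import urllib2

class WebPage(object):
    def __init__(self, url):
        self.url = url
        self.pageSource = None

    def fetch(self, retry=1):
        # Download the page source; return True on success, False otherwise.
        for _ in range(retry):
            try:
                self.pageSource = urllib2.urlopen(self.url, timeout=10).read()
                return True
            except Exception:
                continue
        return False

    def getDatas(self):
        # Return the url together with the downloaded page source.
        return self.url, self.pageSource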
def _taskHandler(self, url): """ 根据指定的url,抓取网页,并进行相应的访问控制 """ print "Visiting : " + url webPage = WebPage(url) # 抓取页面内容 flag = webPage.fetch() if flag: url, pageSource = webPage.getDatas() # 抽取小组主页的置顶贴 match_obj = REGroup.match(url) if match_obj is not None: group_id = match_obj.group(1) # 添加置顶贴的topic列表 self._addStickTopic(webPage) return True # 抽取普通讨论贴 match_obj = REDiscussion.match(url) if match_obj is not None: group_id = match_obj.group(1) start = int(match_obj.group(2)) self._addTopicLink(webPage, start) return True log.error("抓取小组讨论列表时,发现网址格式错误。Group ID: %s, URL: %s" % (self.group_id, url)) # if page reading fails self.failed_href.add(url) return False
def readWebPage(self, urlString, depth=1, isExternal=False):
    webPageData = self.db.websites.search(
        filters=all(eq('address', WebPage.parseUrl(urlString).string))).rows()
    pageLinks = []
    result = None
    if len(webPageData) == 0:
        return result
    webPageData = webPageData[0]
    pageId = webPageData[0]
    depthData = self.db.session.search('depth', all(eq('website_id', pageId)))
    if len(depthData) > 0:
        depth = depthData[0][0]
    result = WebPage(url=webPageData[1], depth=depth, isExternal=isExternal)
    query = self.db.execute(
        'SELECT w.{0}, r.{0} from links join websites as w on links.{1} = w.id join websites as r on links.{2} = r.id WHERE w.id = {3};'
        .format(self.db.websites.fields[1], self.db.links.fields[1], self.db.links.fields[2], pageId))
    for row in iter(query):
        pageLinks.append(WebPage(url=row[1], parent=result, depth=depth + 1))
    result.links = pageLinks
    return result
def task_handler(topic_id, seg_list):
    f = codecs.open('tables/TopicInfo-title.txt', 'a', 'utf-8')
    url = 'http://www.douban.com/group/topic/' + topic_id + '/'
    print 'Visiting: ', url
    webPage = WebPage(url)
    flag = webPage.fetch()
    if flag:
        url, pageSource = webPage.getDatas()  # pageSource is already unicode
        page = etree.HTML(pageSource)
        content = page.xpath(u"/html/body/div[@id='wrapper']/div[@id='content']")[0]
        tmp = page.xpath(u"//table[@class='infobox']//td[@class='tablecc']")
        if len(tmp) == 0:
            # The title was not truncated
            titlenode = content.xpath("h1")[0]
            title = titlenode.text.strip()
        else:
            titlenode = tmp[0]
            title = etree.tostring(titlenode, method='text', encoding='utf-8').strip()
        if not isinstance(title, unicode):
            title = title.decode("utf-8")
        seg_list.insert(4, title)
        f.write('[=]'.join(seg_list) + '\n')
    else:
        failed_set.add(topic_id)
    f.close()
def _taskHandler(self, url):
    # Fetch the page source first, then save it; both are highly blocking
    # operations, so they are handed off to a worker thread.
    webPage = WebPage(url)
    tmp = webPage.fetch()
    if tmp:
        self._callback_filter(webPage)
        self._saveTaskResults(webPage)
        self._addUnvisitedHrefs(webPage)
def _taskHandler(self, url):
    # Fetch the page source first, then save it; both are highly blocking
    # operations, so they are handed off to a worker thread.
    webPage = WebPage(url)
    retry = 1
    if webPage.fetch(retry):
        print 'Visited URL : %s ' % url
        self._saveTaskResults(webPage)
        self._addUnvisitedHrefs(webPage)
def _taskHandler(self, url):
    '''Functions whose names start with an underscore are placed on the queue for worker threads to pick up.'''
    my_web = WebPage(url)
    if my_web.fetch():
        # print 'has visited %s' % url
        self._saveTaskResults(my_web)
        self._addUnvisitedHrefs(my_web)
def testurl():
    url = "http://cy.5156edu.com/cymore.html"
    w = WebPage(url)
    hrefstart = "http://cy.5156edu.com"
    if w.fetch():
        url, pageSource = w.getDatas()
        soup = BeautifulSoup(pageSource)
        results = soup.find_all('a', href=re.compile(r'/[a-z]\.html'))
        print len(results)
        for a in results:
            print urljoin(hrefstart, a.get('href'))
def testdb():
    d = Database('testdb.sql')
    url = "http://cy.5156edu.com/html4/31232.html"
    w = WebPage(url)
    if w.fetch():
        url, pageSource = w.getDatas()
        soup = BeautifulSoup(pageSource)
        temp = soup.find('table', bgcolor='#C0C0C0')
        infos = temp.find_all('td')
        info = getInfo(url, infos)
        d.saveData(info)
def selfTesting():
    for i in range(97, 123):
        url = 'http://cy.5156edu.com/html2/%s.html' % chr(i)
        w = WebPage(url)
        if w.fetch():
            url, pageSource = w.getDatas()
            soup = BeautifulSoup(pageSource)
            temp = soup.find('td', bgcolor='#E8F3FF')
            page = int(re.findall(r'\d+', temp.text)[1])
            print url
            # Use a separate loop variable so the outer index is not shadowed
            for j in range(2, page + 1):
                href = "%s_%d.html" % (url[:-5], j)
                print href
def selfTesting(self):
    url = 'http://www.baidu.com'
    print '\nVisiting www.baidu.com directly'
    my_web = WebPage(url)
    pageSource = my_web.fetch()  # test the network connection
    if pageSource is None:
        print 'please check your network'
    elif not self.isDatabaseAvaliable():
        print 'please make sure you have the permission to save data: %s\n' % args.dbFile
    else:
        self._saveTaskResults(my_web)
        print 'save data successfully'
        print 'seems all is ok'
def selfTesting(self, args): url = "http://www.baidu.com/" print "\nVisiting www.baidu.com" # 测试网络,能否顺利获取百度源码 w = WebPage(url) pageSource = w.fetch() print pageSource if pageSource == None: print "Please check your network and make sure it's connected.\n" # 数据库测试 elif not self._isDatabaseAvaliable(): print "Please make sure you have the permission to save data: %s\n" % args.dbFile # 保存数据 else: self._saveTaskResults(w) print "Create logfile and database Successfully." print "Already save Baidu.com, Please check the database record." print "Seems No Problem!\n"
def _taskHandler(self, url): """ 根据指定的url,抓取网页 """ print "Visiting : " + url webPage = WebPage(url) # 抓取页面内容 flag = webPage.fetch() if flag: self.lock.acquire() #锁住该变量,保证操作的原子性 self.currentPeriodVisits += 1 self.lock.release() self._saveTaskResults(webPage) self._addUnvisitedGroups(webPage) return True # if page reading fails return False
def selfTesting(self, args):
    url = 'http://www.baidu.com/'
    print '\nVisiting www.baidu.com'
    pageSource = WebPage(url).fetch()
    if pageSource is None:
        print 'Please check your network and make sure it\'s connected.\n'
    elif not self._isDatabaseAvaliable():
        print 'Please make sure you have the permission to save data: %s\n' % args.dbFile
    else:
        self._saveTaskResults(url, pageSource)
        print 'Created logfile and database successfully.'
        print 'Already saved Baidu.com; please check the database record.'
        print 'Seems no problem!\n'
def _taskHandler(self, url): """ 根据指定的url,抓取网页,并进行相应的访问控制 """ print "Visiting : " + url webPage = WebPage(url) # 抓取页面内容 flag = webPage.fetch() if flag: url, pageSource = webPage.getDatas() hrefs = self._getAllHrefsFromPage(url, pageSource) # 找到有效的链接 post_list = [] next_page_url = None for href in hrefs: # 只有满足讨论帖链接格式的链接才会被处理 m = regex_post_first.match(href) if self._isHttpOrHttpsProtocol(href) and m is not None: post_list.append(m.group('post_id')) # 在当前页面中查找匹配“下一页”的链接 m = regex_next_page.match(href) if m != None and (not m.group() in self.visited_href): url = m.group() print 'Add next page link: ', url self.thread_pool.putTask(self._taskHandler, url) self.visited_href.add(url) for post_id in post_list: #print "Add thread link: ", thread self.post_list.append(post_id) # 存储已经抓取的topic list self.save_thread.putTask(self._saveTopicHandler, post_list) else: log.error(u"抓取讨论帖列表时,发现网址格式错误。URL: %s" % url) # if page reading fails self.failed_href.add(url) return False
def test():
    page = WebPage(url='pduch.kis.p.lodz.pl')
    page.downloadContent()
    hist = WebsiteDatabase()
    hist.insertWebpage(page, connection=True)
    if not hist.isInThisSession(page):
        hist.appendSession(page)
    hist.readWebPage('pduch.kis.p.lodz.pl')
    page = WebPage(url='http://www.kis.p.lodz.pl/')
    print hist.wasPageVisited(page)
def __init__(self, args, depth=1):
    self.links = [WebPage(x) for x in args.url]
    self.depth = depth
    self.historyDb = WebsiteDatabase()
    self.done = False
    self.options = args
    self.results = {link.url.domain: Result() for link in self.links}
    self.cloudIndexer = CloudSearchIndexer.forDomainIndex("websites")
    if args.graph or args.rank:
        self.webGraph = Graph(distance=30.0)
        for link in self.links:
            self.webGraph.add_node(link.url.domain, radius=15, fill=(1, 0, 0, 0.5))
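# Hypothetical sketch of the argparse namespace consumed by this constructor
# and by the selfTesting() variants above: only the attribute names (url,
# dbFile, graph, rank) appear in the snippets; the flags, defaults, and help
# texts below are assumptions.
import argparse

def build_args(argv=None):
    parser = argparse.ArgumentParser(description='crawler driver (sketch)')
    parser.add_argument('url', nargs='+', help='one or more start URLs')
    parser.add_argument('--dbFile', default='crawler.db', help='database file used to save results')
    parser.add_argument('--depth', type=int, default=1, help='crawl depth')
    parser.add_argument('--graph', action='store_true', help='build a graph of visited domains')
    parser.add_argument('--rank', action='store_true', help='rank nodes of the domain graph')
    return parser.parse_args(argv)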
def _taskHandler(self, url): """ 根据指定的url,抓取网页,并进行相应的访问控制 """ print "Visiting : " + url webPage = WebPage(url) # 抓取页面内容 flag = webPage.fetch() match_obj = RETopic.match(url) match_obj2 = REComment.match(url) if flag: if match_obj is not None: topic_id = match_obj.group(1) topic = Topic(topic_id, self.groupID) comment_list = topic.parse(webPage, True) # First page parsing self.topicDict[topic_id] = topic # 保存到文件 self.saveThread.putTask(self._save_handler, comment_list, topic = topic) # 如果 elif match_obj2 is not None: topic_id = match_obj2.group(1) start = int(match_obj2.group(2)) # 抽取非第一页的评论数据 if topic_id in self.topicDict: topic = self.topicDict[topic_id] if topic is None: log.error('未知程序错误:该topic已经抓取结束,已释放相关内存,topic id:%s' % topic_id) return False else: log.error('未知程序错误:在topicDict字典中找不到topic id: %s' % topic_id) self.failed.add(topic_id) self.finished.add(topic_id) return False comment_list = topic.parse(webPage, False) # non-firstpage parsing # 保存到文件 self.saveThread.putTask(self._save_handler, comment_list, topic = None) else: #pdb.set_trace() log.info('Topic链接格式错误:%s in Group: %s.' % (url, self.groupID)) # 判断抓取是否结束,如果结束,则释放dict内存 # 这个很重要,因为随着topic数量增多,内存会占很多 if topic.isComplete(): self.topicDict[topic_id] = None self.finished.add(topic_id) log.info('Topic: %s 抓取结束。' % topic_id) self.visitedHref.add(url) return True else: # 处理抓取失败的网页集合 # 只要一个网页抓取失败,则加入到finished if match_obj is not None: # 讨论贴的第一页就没有抓到,则将其列入finished名单中 topic_id = match_obj.group(1) elif match_obj2 is not None: topic_id = match_obj2.group(1) start = int(match_obj2.group(2)) else: log.info('Topic链接格式错误:%s in Group: %s.' % (url, self.groupID)) # 添加抓取失败的topic id和标记抓取结束的topic self.failed.add(topic_id) self.finished.add(topic_id) # 有可能已经记录了一些某些topic的信息 self.visitedHref.add(url) return False
def _taskHandler(self, url): """ 根据指定的url,抓取网页,并进行相应的访问控制 """ print "Visiting : " + url webPage = WebPage(url) # 抓取页面内容 flag = webPage.fetch() match_obj = RETopic.match(url) match_obj2 = REComment.match(url) if flag: if match_obj is not None: topic_id = match_obj.group(1) topic = Topic(topic_id, self.group_id) comment_list = topic.parse(webPage, isFirstPage = True) # First page parsing self.topic_dict[topic_id] = topic # 保存到单个文件(已废弃不用) #self.save_thread.putTask(self._saveHandler, comment_list, topic = topic) elif match_obj2 is not None: topic_id = match_obj2.group(1) start = int(match_obj2.group(2)) # 抽取非第一页的评论数据 if topic_id in self.topic_dict: topic = self.topic_dict[topic_id] if topic is None: log.error('未知程序错误:结束topic id为%s的抽取,释放内存。' % topic_id) self.topic_dict[topic_id] = None return False else: # 这里的含义为:必须先处理第一页的评论,否则该topic_id不会作为self.topic_dict的键出现 log.error('错误:必须先抽取第一页的评论数据:topic id: %s' % topic_id) self.failed.add(topic_id) self.finished.add(topic_id) return False comment_list = topic.parse(webPage, isFirstPage = False) # non-firstpage parsing # 保存到单个文件(已废弃不用) #self.save_thread.putTask(self._saveHandler, comment_list, topic = None) else: #pdb.set_trace() log.info('Topic链接格式错误:%s in Group: %s.' % (url, self.group_id)) # 判断抓取是否结束,如果结束,则释放dict内存 # 这个很重要,因为随着topic数量增多,内存会占很多 if topic.isComplete(): self.save_thread.putTask(self._saveTopicHandler, self.topic_dict, topic_id) #self.topic_dict[topic_id] = None # 释放资源 self.finished.add(topic_id) log.info('Topic: %s 抓取结束。' % topic_id) self.visited_href.add(url) return True else: # 处理抓取失败的网页集合 # 只要一个网页抓取失败,则加入到finished if match_obj is not None: # 讨论贴的第一页就没有抓到,则将其列入finished名单中 topic_id = match_obj.group(1) elif match_obj2 is not None: topic_id = match_obj2.group(1) start = int(match_obj2.group(2)) else: log.info('Topic链接格式错误:%s in Group: %s.' % (url, self.group_id)) # 添加抓取失败的topic id和标记抓取结束的topic self.failed.add(topic_id) self.finished.add(topic_id) # 有可能已经记录了一些某些topic的信息 self.visited_href.add(url) return False
def _taskHandler(self, url):
    webPage = WebPage(url)
    if webPage.fetch():
        self._saveTaskResults(webPage)
        self._addUnvisitedHrefs(webPage)
        try:
            info.append(infos[i].text)
        except Exception as e:
            print e
    return info

def extractUrls(filename):
    try:
        f = open(filename, "rb")
        s = f.read()
        f.close()
    except Exception as e:
        print e
    else:
        urls = re.findall(r'http://cy.5156edu.com/html4/\d+.html', s)
        for url in urls:
            w = WebPage(url)
            d = Database("data.sql")
            if w.fetch():
                try:
                    href, pageSource = w.getDatas()
                    soup = BeautifulSoup(pageSource)
                    temp = soup.find('table', bgcolor='#C0C0C0')
                    infos = temp.find_all("td")
                    info = getInfo(href, infos)
                    d.saveData(info)
                except Exception as e:
                    print e

if __name__ == '__main__':
    extractUrls("spider.log")
def _taskHandler(self, url): """ 根据指定的url,抓取网页,并进行相应的访问控制 """ print "Visiting : " + url webPage = WebPage(url) # 抓取页面内容 flag = webPage.fetch() m = regex_post.match(url) if m == None: log.info(u'Post链接格式错误:%s in Group: %s.' % (url, self.section_id)) return True else: log.info(u'访问:' + url) comment_page_index = int(m.group('page_index')) post_id = m.group('post_id') if flag: if comment_page_index == 1: # 首页评论 post = Post(post_id, self.section_id) # 解析讨论帖的第一个页:包括原帖内容和评论内容 comment_list = post.parse(webPage, isFirstPage = True) # First page parsing self.post_dict[post_id] = post self.next_page[post_id] = 2 elif comment_page_index > 1: # 抽取非第一页的评论数据 if post_id in self.post_dict: post = self.post_dict[post_id] else: # 这里的含义为:必须先处理第一页的评论,否则该post_id不会作为self.topic_dict的键出现 log.error(u'错误:必须先抽取第一页的评论数据:post id: %s' % post_id) self.failed.add(topic_id) self.finished.add(topic_id) return False if post is None: log.error(u'未知程序错误:结束post id为%s的抽取,释放内存。' % post_id) self.post_dict[post_id] = post return False comment_list = post.parse(webPage, isFirstPage = False) # non-firstpage parsing else: log.info(u'Post链接格式错误:%s in Group: %s.' % (url, self.section_id)) # 判断抓取是否结束,如果结束,则释放dict内存 # 这个很重要,因为随着topic数量增多,内存会占很多 if post.is_complete(): self.save_thread.putTask(self._saveTopicHandler, self.post_dict, post_id) self.finished.add(post_id) log.info(u'Topic: %s 抓取结束。' % post_id) self.visited_href.add(url) return True else: # 处理抓取失败的网页集合,只要一个网页抓取失败,则加入到finished # 添加抓取失败的post id和标记抓取结束的post self.failed.add(post_id) self.finished.add(post_id) # 有可能已经记录了一些某些topic的信息 self.visited_href.add(url) return False