Example #1
def _taskHandler(self, url):
    """Fetch the web page for the given url and apply the appropriate access control."""
    print "Visiting : " + url
    webPage = WebPage(url)
    # Fetch the page content
    flag = webPage.fetch()
    if flag:
        url, pageSource = webPage.getDatas()
        # Extract the sticky topics on the group's front page
        match_obj = REGroup.match(url)
        if match_obj is not None:
            group_id = match_obj.group(1)
            # Add the sticky topics to the topic list
            self._addStickTopic(webPage)
            return True

        # Extract ordinary discussion posts
        match_obj = REDiscussion.match(url)
        if match_obj is not None:
            group_id = match_obj.group(1)
            start = int(match_obj.group(2))

            self._addTopicLink(webPage, start)
            return True

        log.error("Malformed URL while crawling the group discussion list. Group ID: %s, URL: %s" % (self.group_id, url))

    # if page reading fails
    self.failed_href.add(url)
    return False
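REGroup and REDiscussion above are precompiled regular expressions from the surrounding module; their patterns are not shown on this page. All of the examples also share a small WebPage helper whose implementation is likewise not shown. A minimal sketch, assuming only what the snippets reveal (fetch() returns a success flag, getDatas() returns the final URL plus the page source); this is a guess at the interface, not the original class:

import urllib2

class WebPage(object):
    # Hypothetical sketch of the helper the examples assume; not the original class.

    def __init__(self, url):
        self.url = url
        self.pageSource = None

    def fetch(self):
        # Download the page; return True on success, False on any error.
        try:
            response = urllib2.urlopen(self.url, timeout=10)
            self.url = response.geturl()  # follow redirects
            self.pageSource = response.read()
            return True
        except Exception:
            return False

    def getDatas(self):
        # The final URL and the raw page source, as every example unpacks them.
        return self.url, self.pageSource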
Example #2
import codecs
from lxml import etree

def task_handler(topic_id, seg_list):
    f = codecs.open('tables/TopicInfo-title.txt', 'a', 'utf-8')
    url = 'http://www.douban.com/group/topic/' + topic_id + '/'
    print 'Visiting: ', url
    webPage = WebPage(url)
    flag = webPage.fetch()

    if flag:
        url, pageSource = webPage.getDatas()  # pageSource is already unicode
        page = etree.HTML(pageSource)
        content = page.xpath(u"/html/body/div[@id='wrapper']/div[@id='content']")[0]
        tmp = page.xpath(u"//table[@class='infobox']//td[@class='tablecc']")
        if len(tmp) == 0:
            # The title was not truncated
            titlenode = content.xpath("h1")[0]
            title = titlenode.text.strip()
        else:
            # The truncated title's full text is carried in the infobox cell
            titlenode = tmp[0]
            title = etree.tostring(titlenode, method='text', encoding='utf-8').strip()

        if not isinstance(title, unicode):
            title = title.decode("utf-8")
        seg_list.insert(4, title)
        f.write('[=]'.join(seg_list) + '\n')
    else:
        failed_set.add(topic_id)

    f.close()
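The tablecc branch handles Douban's truncated titles: when a title is too long, the h1 holds a shortened form and the full text sits in a td with class 'tablecc', which is why the snippet prefers that cell when it exists. A self-contained illustration of the same XPath logic on hypothetical markup:

from lxml import etree

html = (u"<html><body><div id='wrapper'><div id='content'>"
        u"<h1>Short title...</h1>"
        u"<table class='infobox'><tr><td class='tablecc'>"
        u"Short title, shown in full</td></tr></table>"
        u"</div></div></body></html>")
page = etree.HTML(html)
tmp = page.xpath(u"//table[@class='infobox']//td[@class='tablecc']")
# The full-title cell exists, so read the title from it rather than <h1>
print etree.tostring(tmp[0], method='text', encoding='utf-8').strip()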
Example #3
import re
from urlparse import urljoin

from bs4 import BeautifulSoup

def testurl():
    url = "http://cy.5156edu.com/cymore.html"
    w = WebPage(url)
    hrefstart = "http://cy.5156edu.com"
    if w.fetch():
        url, pageSource = w.getDatas()
        soup = BeautifulSoup(pageSource)
        # Collect the per-letter index links (/a.html ... /z.html)
        results = soup.find_all('a', href=re.compile(r'/[a-z]\.html'))
        print len(results)
        for a in results:
            print urljoin(hrefstart, a.get('href'))
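BeautifulSoup applies a compiled regex passed as href= to each attribute value with its search() method, so only anchors whose href contains a single-letter page name survive the filter. A quick illustration on hypothetical markup:

import re
from bs4 import BeautifulSoup

html = '<a href="/a.html">a</a> <a href="/html4/31232.html">entry</a>'
soup = BeautifulSoup(html)
# Only the per-letter index link matches /[a-z]\.html
print [a.get('href') for a in soup.find_all('a', href=re.compile(r'/[a-z]\.html'))]
# [u'/a.html']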
Example #4
from bs4 import BeautifulSoup

def testdb():
    d = Database('testdb.sql')
    url = "http://cy.5156edu.com/html4/31232.html"
    w = WebPage(url)
    if w.fetch():
        url, pageSource = w.getDatas()
        soup = BeautifulSoup(pageSource)
        # The entry's details live in the table with the grey border
        temp = soup.find('table', bgcolor='#C0C0C0')
        infos = temp.find_all('td')
        info = getInfo(url, infos)
        d.saveData(info)
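Database and getInfo are project helpers that this page never defines in full (Example #7 carries getInfo's body). Purely as an assumption about their shape, a minimal sqlite3-backed sketch that matches how they are called, reusing the '[=]' field separator from Example #2:

import sqlite3

class Database(object):
    # Hypothetical sketch of the persistence helper; the real schema is unknown.

    def __init__(self, path):
        self.conn = sqlite3.connect(path)
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS entries (url TEXT, fields TEXT)")

    def saveData(self, info):
        # info is the list built by getInfo: the page URL plus the <td> texts.
        self.conn.execute(
            "INSERT INTO entries (url, fields) VALUES (?, ?)",
            (info[0], u"[=]".join(info[1:])))
        self.conn.commit()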
Example #5
import re

from bs4 import BeautifulSoup

def selfTesting():
    for i in range(97, 123):
        # One index page per letter, a.html through z.html
        url = 'http://cy.5156edu.com/html2/%s.html' % chr(i)
        w = WebPage(url)
        if w.fetch():
            url, pageSource = w.getDatas()
            soup = BeautifulSoup(pageSource)
            # The pager cell holds two numbers; the second is the page count
            temp = soup.find('td', bgcolor='#E8F3FF')
            page = int(re.findall(r'\d+', temp.text)[1])
            print url
            for p in range(2, page + 1):
                href = "%s_%d.html" % (url[:-5], p)
                print href
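The pager arithmetic in selfTesting strips the trailing ".html" (five characters) from the first page's URL and appends "_<n>.html" for each further page:

url = 'http://cy.5156edu.com/html2/a.html'
print "%s_%d.html" % (url[:-5], 2)  # http://cy.5156edu.com/html2/a_2.html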
Example #6

def _taskHandler(self, url):
    """Fetch the web page for the given url and apply the appropriate access control."""
    print "Visiting : " + url
    webPage = WebPage(url)
    # Fetch the page content
    flag = webPage.fetch()
    if flag:
        url, pageSource = webPage.getDatas()
        hrefs = self._getAllHrefsFromPage(url, pageSource)
        # Collect the valid links
        post_list = []
        next_page_url = None
        for href in hrefs:
            # Only links matching the discussion-post URL pattern are handled
            m = regex_post_first.match(href)
            if self._isHttpOrHttpsProtocol(href) and m is not None:
                post_list.append(m.group('post_id'))

            # Look for the "next page" link on the current page
            m = regex_next_page.match(href)
            if m is not None and (m.group() not in self.visited_href):
                url = m.group()
                print 'Add next page link: ', url
                self.thread_pool.putTask(self._taskHandler, url)
                self.visited_href.add(url)

        for post_id in post_list:
            #print "Add thread link: ", post_id
            self.post_list.append(post_id)

        # Persist the topic list crawled so far
        self.save_thread.putTask(self._saveTopicHandler, post_list)
    else:
        log.error(u"Malformed URL while crawling the post list. URL: %s" % url)
        # if page reading fails
        self.failed_href.add(url)
        return False
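thread_pool and save_thread are not defined on this page; putTask evidently enqueues a callable with its arguments for a worker to run later. A minimal sketch of such a pool, assuming (not reproducing) the original interface:

import threading
from Queue import Queue

class ThreadPool(object):
    # Hypothetical sketch of the pool behind putTask; not the original class.

    def __init__(self, num_workers=4):
        self.tasks = Queue()
        for _ in range(num_workers):
            t = threading.Thread(target=self._worker)
            t.daemon = True
            t.start()

    def putTask(self, func, *args):
        # Enqueue a callable together with its arguments.
        self.tasks.put((func, args))

    def _worker(self):
        # Pull tasks forever; daemon threads die with the main program.
        while True:
            func, args = self.tasks.get()
            try:
                func(*args)
            finally:
                self.tasks.task_done()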
Example #7
import re

from bs4 import BeautifulSoup

def getInfo(url, infos):
    # Reconstructed from a truncated fragment: collect the page URL plus the
    # text of every <td> cell (the leading [url] element is an assumption).
    info = [url]
    for i in range(len(infos)):
        try:
            info.append(infos[i].text)
        except Exception, e:
            print e
    return info

def extractUrls(filename):
    try:
        f = open(filename, "rb")
        s = f.read()
        f.close()
    except Exception, e:
        print e
    else:
        # Pull every detail-page URL recorded in the crawler log
        urls = re.findall(r'http://cy\.5156edu\.com/html4/\d+\.html', s)
        for url in urls:
            w = WebPage(url)
            d = Database("data.sql")
            if w.fetch():
                try:
                    href, pageSource = w.getDatas()
                    soup = BeautifulSoup(pageSource)
                    temp = soup.find('table', bgcolor='#C0C0C0')
                    infos = temp.find_all("td")
                    info = getInfo(href, infos)
                    d.saveData(info)
                except Exception, e:
                    print e

if __name__ == '__main__':
    extractUrls("spider.log")