def _taskHandler(self, url):
    """ Fetch the page at the given URL and apply the corresponding access control. """
    print "Visiting : " + url
    webPage = WebPage(url)
    # fetch the page content
    flag = webPage.fetch()
    if flag:
        url, pageSource = webPage.getDatas()
        # extract the sticky topics on the group homepage
        match_obj = REGroup.match(url)
        if match_obj is not None:
            group_id = match_obj.group(1)
            # add the list of sticky topics
            self._addStickTopic(webPage)
            return True
        # extract the regular discussion pages
        match_obj = REDiscussion.match(url)
        if match_obj is not None:
            group_id = match_obj.group(1)
            start = int(match_obj.group(2))
            self._addTopicLink(webPage, start)
            return True
        log.error("Unexpected URL format while crawling the group discussion list. Group ID: %s, URL: %s" % (self.group_id, url))
    # if page reading fails
    self.failed_href.add(url)
    return False

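# The handler above relies on two module-level patterns, REGroup and REDiscussion,
# that are defined elsewhere in the project. A minimal sketch of plausible
# definitions, inferred from how group(1) and group(2) are used above; the actual
# patterns may differ:
import re

# group homepage, e.g. http://www.douban.com/group/python/
REGroup = re.compile(r'^http://www\.douban\.com/group/([^/]+)/?$')
# paged discussion list, e.g. http://www.douban.com/group/python/discussion?start=25
REDiscussion = re.compile(r'^http://www\.douban\.com/group/([^/]+)/discussion\?start=(\d+)')
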
def task_handler(topic_id, seg_list):
    f = codecs.open('tables/TopicInfo-title.txt', 'a', 'utf-8')
    url = 'http://www.douban.com/group/topic/' + topic_id + '/'
    print 'Visiting: ', url
    webPage = WebPage(url)
    flag = webPage.fetch()
    if flag:
        url, pageSource = webPage.getDatas()
        # pageSource is already unicode
        page = etree.HTML(pageSource)
        content = page.xpath(u"/html/body/div[@id='wrapper']/div[@id='content']")[0]
        tmp = page.xpath(u"//table[@class='infobox']//td[@class='tablecc']")
        if len(tmp) == 0:
            # the title was not truncated
            titlenode = content.xpath("h1")[0]
            title = titlenode.text.strip()
        else:
            # the truncated title's full text sits in the infobox cell
            titlenode = tmp[0]
            title = etree.tostring(titlenode, method='text', encoding='utf-8').strip()
        if not isinstance(title, unicode):
            title = title.decode("utf-8")
        seg_list.insert(4, title)
        f.write('[=]'.join(seg_list) + '\n')
    else:
        failed_set.add(topic_id)
    f.close()

def testurl():
    url = "http://cy.5156edu.com/cymore.html"
    w = WebPage(url)
    hrefstart = "http://cy.5156edu.com"
    if w.fetch():
        url, pageSource = w.getDatas()
        soup = BeautifulSoup(pageSource)
        results = soup.find_all('a', href=re.compile(r'/[a-z]\.html'))
        print len(results)
        for a in results:
            print urljoin(hrefstart, a.get('href'))

def testdb():
    d = Database('testdb.sql')
    url = "http://cy.5156edu.com/html4/31232.html"
    w = WebPage(url)
    if w.fetch():
        url, pageSource = w.getDatas()
        soup = BeautifulSoup(pageSource)
        temp = soup.find('table', bgcolor='#C0C0C0')
        infos = temp.find_all('td')
        info = getInfo(url, infos)
        d.saveData(info)

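# testdb() assumes a project-internal Database class with a saveData(info) method;
# its real implementation is not shown here. A hypothetical sqlite3-based sketch of
# the assumed interface (table layout and field handling are guesses):
import sqlite3

class Database(object):
    def __init__(self, path):
        self.conn = sqlite3.connect(path)
        self.conn.execute('CREATE TABLE IF NOT EXISTS idiom (url TEXT, fields TEXT)')

    def saveData(self, info):
        # info is assumed to be the list built by getInfo(): the page URL followed by cell texts
        self.conn.execute('INSERT INTO idiom VALUES (?, ?)',
                          (info[0], '[=]'.join(info[1:])))
        self.conn.commit()
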
def selfTesting():
    # walk the per-letter index pages (a.html .. z.html) and print every page of each index
    for i in range(97, 123):
        url = 'http://cy.5156edu.com/html2/%s.html' % chr(i)
        w = WebPage(url)
        if w.fetch():
            url, pageSource = w.getDatas()
            soup = BeautifulSoup(pageSource)
            temp = soup.find('td', bgcolor='#E8F3FF')
            # the highlighted cell holds the pagination text; its second number is the page count
            page = int(re.findall(r'\d+', temp.text)[1])
            print url
            for j in range(2, page + 1):
                href = "%s_%d.html" % (url[:-5], j)
                print href

def _taskHandler(self, url):
    """ Fetch the page at the given URL and apply the corresponding access control. """
    print "Visiting : " + url
    webPage = WebPage(url)
    # fetch the page content
    flag = webPage.fetch()
    if flag:
        url, pageSource = webPage.getDatas()
        hrefs = self._getAllHrefsFromPage(url, pageSource)
        # collect the valid links
        post_list = []
        next_page_url = None
        for href in hrefs:
            # only links matching the discussion-post URL pattern are processed
            m = regex_post_first.match(href)
            if self._isHttpOrHttpsProtocol(href) and m is not None:
                post_list.append(m.group('post_id'))
            # look for the "next page" link on the current page
            m = regex_next_page.match(href)
            if m is not None and m.group() not in self.visited_href:
                url = m.group()
                print 'Add next page link: ', url
                self.thread_pool.putTask(self._taskHandler, url)
                self.visited_href.add(url)
        for post_id in post_list:
            #print "Add thread link: ", post_id
            self.post_list.append(post_id)
        # persist the list of topics crawled so far
        self.save_thread.putTask(self._saveTopicHandler, post_list)
    else:
        log.error(u"Unexpected URL format while crawling the post list. URL: %s" % url)
        # if page reading fails
        self.failed_href.add(url)
        return False

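# This handler depends on regex_post_first (which must expose a named group
# 'post_id') and regex_next_page (which must match the full URL of the "next page"
# link). The real patterns live elsewhere in the project; a hedged sketch, assuming
# the same Douban topic/discussion URL shapes used in the other snippets:
import re

regex_post_first = re.compile(r'^https?://www\.douban\.com/group/topic/(?P<post_id>\d+)/?')
regex_next_page = re.compile(r'^https?://www\.douban\.com/group/[^/]+/discussion\?start=\d+$')
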
        # (remainder of getInfo(url, infos))
        try:
            info.append(infos[i].text)
        except Exception, e:
            print e
    return info

def extractUrls(filename):
    # re-crawl every detail-page URL recorded in the given log file
    try:
        f = open(filename, "rb")
        s = f.read()
        f.close()
    except Exception, e:
        print e
    else:
        urls = re.findall(r'http://cy.5156edu.com/html4/\d+.html', s)
        for url in urls:
            w = WebPage(url)
            d = Database("data.sql")
            if w.fetch():
                try:
                    href, pageSource = w.getDatas()
                    soup = BeautifulSoup(pageSource)
                    temp = soup.find('table', bgcolor='#C0C0C0')
                    infos = temp.find_all("td")
                    info = getInfo(href, infos)
                    d.saveData(info)
                except Exception, e:
                    print e

if __name__ == '__main__':
    extractUrls("spider.log")
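
# All of the snippets above share a WebPage helper with a fetch()/getDatas() pair;
# its implementation is not included here. A minimal urllib2-based sketch of the
# assumed contract (fetch() returns a success flag, getDatas() returns the final
# URL plus the decoded page source), for reference only:
import urllib2

class WebPage(object):
    def __init__(self, url):
        self.url = url
        self.pageSource = None

    def fetch(self):
        # return True on success, False on any network/HTTP error
        try:
            response = urllib2.urlopen(self.url, timeout=10)
            self.url = response.geturl()  # keep the URL after redirects
            self.pageSource = response.read().decode('utf-8', 'ignore')
            return True
        except Exception, e:
            print e
            return False

    def getDatas(self):
        return self.url, self.pageSource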