Example #1
 def _taskHandler(self, url):
     # Fetch the page source first, then save it; both are highly blocking operations, so hand them off to a worker thread
     # print 'url=\t',url
     webPage = WebPage(url)
     if webPage.fetch():
         self._saveTaskResults(webPage)
         self._addUnvisitedHrefs(webPage)
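Most examples in this collection construct a WebPage, call fetch(), and sometimes unpack getDatas(), but the class itself is never shown. The following is a minimal sketch of that interface, inferred purely from the call sites: the method names and the retry parameter appear in the examples, everything else is an assumption.

import urllib2

class WebPage(object):
    """Minimal sketch of the interface the examples rely on. Inferred from
    the call sites only; each project ships its own implementation."""

    def __init__(self, url):
        self.url = url
        self.pageSource = None

    def fetch(self, retry=1):
        # Return the page source on success and None on failure, so callers
        # can treat the result either as data or as a truthiness flag.
        for _ in range(retry):
            try:
                self.pageSource = urllib2.urlopen(self.url, timeout=10).read()
                return self.pageSource
            except urllib2.URLError, e:
                print 'fetch failed:', e
        return None

    def getDatas(self):
        # Callers unpack this as: url, pageSource = webPage.getDatas()
        return self.url, self.pageSource

Returning the page source from fetch() lets callers use the result both as data (Examples #13 and #15) and as a plain success flag (Example #1). Some examples (e.g. #5) additionally expect pageSource to already be decoded to unicode; a real implementation would decode using the response charset.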
Example #3
 def _taskHandler(self, url):
     """ Fetch the page at the given url and apply the appropriate access control
     """
     print "Visiting : " + url
     webPage = WebPage(url)
     # Fetch the page content
     flag = webPage.fetch()
     if flag:
         url, pageSource = webPage.getDatas()
         # Extract the pinned posts on the group front page
         match_obj = REGroup.match(url)
         if match_obj is not None:
             group_id = match_obj.group(1)
             # Add the topic list of the pinned posts
             self._addStickTopic(webPage)
             return True

         # Extract ordinary discussion posts
         match_obj = REDiscussion.match(url)
         if match_obj is not None:
             group_id = match_obj.group(1)
             start = int(match_obj.group(2))

             self._addTopicLink(webPage, start)
             return True

         log.error("Found a malformed URL while crawling the group discussion list. Group ID: %s, URL: %s" % (self.group_id, url))

     # if page reading fails
     self.failed_href.add(url)
     return False
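REGroup and REDiscussion are defined outside this snippet. Judging from the Douban group URLs used elsewhere in this collection, they plausibly look like the following; the exact patterns are an assumption, not part of the original source:

import re

# Assumed patterns, not from the original source.
# Group front page, e.g. http://www.douban.com/group/12345/
REGroup = re.compile(r'^http://www\.douban\.com/group/(\d+)/?$')
# Paginated discussion list, e.g. http://www.douban.com/group/12345/discussion?start=25
REDiscussion = re.compile(r'^http://www\.douban\.com/group/(\d+)/discussion\?start=(\d+)')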
Example #4
    def readWebPage(self, urlString, depth=1, isExternal=False):
        # Look up the stored record whose address matches the normalized URL.
        webPageData = self.db.websites.search(
            filters=all(eq('address',
                           WebPage.parseUrl(urlString).string))).rows()
        pageLinks = []
        result = None

        if len(webPageData) == 0:
            return result

        webPageData = webPageData[0]
        pageId = webPageData[0]

        # Prefer the crawl depth recorded for this page in the session table.
        depthData = self.db.session.search('depth',
                                           all(eq('website_id', pageId)))
        if len(depthData) > 0:
            depth = depthData[0][0]

        result = WebPage(url=webPageData[1],
                         depth=depth,
                         isExternal=isExternal)

        # Join the links table against websites twice (w = source page,
        # r = target page) to fetch the addresses of all outgoing links.
        query = self.db.execute(
            'SELECT w.{0}, r.{0} from links join websites as w on links.{1} = w.id join websites as r on links.{2} = r.id WHERE w.id = {3};'
            .format(self.db.websites.fields[1], self.db.links.fields[1],
                    self.db.links.fields[2], pageId))

        # Rebuild the outgoing links as child WebPage objects.
        for row in iter(query):
            pageLinks.append(
                WebPage(url=row[1], parent=result, depth=depth + 1))
        result.links = pageLinks

        return result
Example #5
def task_handler(topic_id, seg_list):
    f = codecs.open('tables/TopicInfo-title.txt', 'a','utf-8')
    url = 'http://www.douban.com/group/topic/' + topic_id + '/'
    print 'Visiting: ', url
    webPage = WebPage(url)
    flag = webPage.fetch()
    
    if flag:
        url, pageSource = webPage.getDatas() # pageSource is already unicode
        page = etree.HTML(pageSource)
        content = page.xpath(u"/html/body/div[@id='wrapper']/div[@id='content']")[0]
        tmp = page.xpath(u"//table[@class='infobox']//td[@class='tablecc']")
        if len(tmp) == 0:
            # The title was not truncated
            titlenode = content.xpath("h1")[0]
            title = titlenode.text.strip()
        else:
            titlenode = tmp[0]
            title = etree.tostring(titlenode, method='text', encoding='utf-8').strip()
            
        if not isinstance(title, unicode):
            title = title.decode("utf-8")
        seg_list.insert(4, title)
        f.write('[=]'.join(seg_list) + '\n')
    else:
        failed_set.add(topic_id)
    
    f.close()
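task_handler is queued onto worker threads, so concurrent appends to TopicInfo-title.txt can interleave partial lines. A sketch of one way to serialize the writes; the lock is an addition for illustration, not part of the original:

import codecs
import threading

write_lock = threading.Lock()  # added for illustration; not in the original

def append_record(seg_list):
    # Hold the lock across open/write/close so concurrent tasks never
    # interleave partial lines in the shared output file.
    with write_lock:
        f = codecs.open('tables/TopicInfo-title.txt', 'a', 'utf-8')
        try:
            f.write('[=]'.join(seg_list) + '\n')
        finally:
            f.close()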
Example #6
 def _taskHandler(self, url):
     # Fetch the page source first, then save it; both are highly blocking operations, so hand them off to a worker thread
     webPage = WebPage(url)
     tmp = webPage.fetch()
     if tmp:
         self._callback_filter(webPage)
         self._saveTaskResults(webPage)
         self._addUnvisitedHrefs(webPage)
Example #7
 def _taskHandler(self, url):
     # Fetch the page source first, then save it; both are highly blocking operations, so hand them off to a worker thread
     webPage = WebPage(url)
     retry = 1
     if webPage.fetch(retry):
         print 'Visited URL : %s ' % url
         self._saveTaskResults(webPage)
         self._addUnvisitedHrefs(webPage)
Example #8
 def _taskHandler(self, url):
     '''Functions whose names start with _ are placed in the queue for worker threads to pick up'''
     my_web = WebPage(url)
     #print 'F**k', my_web.fetch()
     if my_web.fetch():
         #print 'has visited %s' % url
         self._saveTaskResults(my_web)
         self._addUnvisitedHrefs(my_web)
Example #10
def testurl():
    url = "http://cy.5156edu.com/cymore.html"
    w = WebPage(url)
    hrefstart = "http://cy.5156edu.com"
    if w.fetch():
        url, pageSource = w.getDatas()
        soup = BeautifulSoup(pageSource)
        results = soup.find_all('a', href=re.compile(r'/[a-z]\.html'))
        print len(results)
        for a in results:
            print urljoin(hrefstart, a.get('href'))
Example #11
def testdb():
    d = Database('testdb.sql')
    url = "http://cy.5156edu.com/html4/31232.html"
    w = WebPage(url)
    if w.fetch():
        url, pageSource = w.getDatas()
        soup = BeautifulSoup(pageSource)
        temp = soup.find('table', bgcolor='#C0C0C0')
        infos = temp.find_all('td')
        info = getInfo(url, infos)
        d.saveData(info)
Example #12
def selfTesting():
    for i in range(97, 123):
        url = 'http://cy.5156edu.com/html2/%s.html' % chr(i)
        w = WebPage(url)
        if w.fetch():
            url, pageSource = w.getDatas()
            soup = BeautifulSoup(pageSource)
            temp = soup.find('td', bgcolor='#E8F3FF')
            # The second number in the cell is taken as the total page count
            page = int(re.findall(r'\d+', temp.text)[1])
            print url
            for j in range(2, page + 1):
                href = "%s_%d.html" % (url[:-5], j)
                print href
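The url[:-5] slice strips the trailing ".html" (five characters) before the page suffix is appended, so the generated links look like this:

url = 'http://cy.5156edu.com/html2/a.html'
print "%s_%d.html" % (url[:-5], 2)   # -> http://cy.5156edu.com/html2/a_2.html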
Example #13
 def selfTesting(self):
     url = 'http://www.baidu.com'
     print '\nVisiting www.baidu.com using directly'
     my_web = WebPage(url)
     pageSource = my_web.fetch()
     # Test the network connection
     if pageSource is None:
         print 'please check your network'
     elif not self.isDatabaseAvaliable():
         print 'please make sure you have the permission to save data: %s\n' % args.dbFile
     else:
         self._saveTaskResults(my_web)
         print 'save data successfully'
         print 'seems all is ok'
Example #15
 def selfTesting(self, args):
     url = "http://www.baidu.com/"
     print "\nVisiting www.baidu.com"
     # Network test: check that the Baidu page source can be fetched
     w = WebPage(url)
     pageSource = w.fetch()
     print pageSource
     if pageSource is None:
         print "Please check your network and make sure it's connected.\n"
     # Database test
     elif not self._isDatabaseAvaliable():
         print "Please make sure you have the permission to save data: %s\n" % args.dbFile
     # Save the data
     else:
         self._saveTaskResults(w)
         print "Create logfile and database Successfully."
         print "Already save Baidu.com, Please check the database record."
         print "Seems No Problem!\n"
Example #16
File: crawler.py  Project: hitalex/crawler
 def _taskHandler(self, url):
     """ 根据指定的url,抓取网页
     """
     print "Visiting : " + url
     webPage = WebPage(url)
     # Fetch the page content
     flag = webPage.fetch()
     if flag:
         self.lock.acquire() # lock the variable to keep the counter update atomic
         self.currentPeriodVisits += 1
         self.lock.release()
         
         self._saveTaskResults(webPage)
         self._addUnvisitedGroups(webPage)
         return True
         
     # if page reading fails
     return False
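The explicit acquire()/release() pair in this example leaks the lock if the increment raises in between. Since locks are context managers in Python 2.5+, the exception-safe equivalent of those three lines is:

     # Exception-safe equivalent of the acquire / increment / release above
     with self.lock:
         self.currentPeriodVisits += 1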
Example #17
 def selfTesting(self, args):
     url = 'http://www.baidu.com/'
     print '\nVisiting www.baidu.com'
     pageSource = WebPage(url).fetch()
     if pageSource is None:
         print 'Please check your network and make sure it\'s connected.\n'
     elif not self._isDatabaseAvaliable():
         print 'Please make sure you have the permission to save data: %s\n' % args.dbFile
     else:
         self._saveTaskResults(url, pageSource)
         print 'Create logfile and database Successfully.'
         print 'Already save Baidu.com, Please check the database record.'
         print 'Seems No Problem!\n'
Example #18

    def _taskHandler(self, url):
        """ Fetch the page at the given url and apply the appropriate access control
        """
        print "Visiting : " + url
        webPage = WebPage(url)
        # Fetch the page content
        flag = webPage.fetch()
        if flag:
            url, pageSource = webPage.getDatas()
            hrefs = self._getAllHrefsFromPage(url, pageSource)
            # Collect the valid links
            post_list = []
            next_page_url = None
            for href in hrefs:
                # Only links that match the discussion-post URL format are processed
                m = regex_post_first.match(href)
                if self._isHttpOrHttpsProtocol(href) and m is not None:
                    post_list.append(m.group('post_id'))

                # Look for a link matching "next page" on the current page
                m = regex_next_page.match(href)
                if m != None and (not m.group() in self.visited_href):
                    url = m.group()
                    print 'Add next page link: ', url
                    self.thread_pool.putTask(self._taskHandler, url)
                    self.visited_href.add(url)
                                
            for post_id in post_list:
                #print "Add thread link: ", thread
                self.post_list.append(post_id)
                
            # Save the topic list crawled so far
            self.save_thread.putTask(self._saveTopicHandler, post_list)
        else:
            log.error(u"Failed to fetch the discussion-post list page. URL: %s" % url)
            # if page reading fails
            self.failed_href.add(url)
            return False
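regex_post_first and regex_next_page are defined elsewhere in the project; the handler only requires that the first expose a named group post_id and that the second match the whole "next page" URL. Hypothetical stand-ins, with a URL layout invented purely for illustration:

import re

# Hypothetical patterns; the real definitions live elsewhere in the project.
regex_post_first = re.compile(r'^https?://example\.com/thread/(?P<post_id>\d+)/?$')
regex_next_page = re.compile(r'^https?://example\.com/forum/\d+\?page=\d+$')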
Example #19
def test():
    page = WebPage(url='pduch.kis.p.lodz.pl')
    page.downloadContent()
    hist = WebsiteDatabase()
    hist.insertWebpage(page, connection=True)
    if not hist.isInThisSession(page):
        hist.appendSession(page)
    hist.readWebPage('pduch.kis.p.lodz.pl')
    page = WebPage(url='http://www.kis.p.lodz.pl/')
    print hist.wasPageVisited(page)
Example #20
    def __init__(self, args, depth=1):
        self.links = [WebPage(x) for x in args.url]
        self.depth = depth
        self.historyDb = WebsiteDatabase()
        self.done = False
        self.options = args
        self.results = {link.url.domain: Result() for link in self.links}

        self.cloudIndexer = CloudSearchIndexer.forDomainIndex("websites")

        if args.graph or args.rank:
            self.webGraph = Graph(distance=30.0)
            for link in self.links:
                self.webGraph.add_node(link.url.domain,
                                       radius=15,
                                       fill=(1, 0, 0, 0.5))
Example #21
 def _taskHandler(self, url):
     """ Fetch the page at the given url and apply the appropriate access control
     """
     print "Visiting : " + url
     webPage = WebPage(url)

     # Fetch the page content
     flag = webPage.fetch()
     match_obj = RETopic.match(url)
     match_obj2 = REComment.match(url)
     topic = None # guard: stays None if the URL matches neither pattern

     if flag:
         if match_obj is not None:
             topic_id = match_obj.group(1)
             topic = Topic(topic_id, self.groupID)
             comment_list = topic.parse(webPage, True) # First page parsing
             self.topicDict[topic_id] = topic
             # Save to file
             self.saveThread.putTask(self._save_handler, comment_list, topic = topic)
         elif match_obj2 is not None:
             topic_id = match_obj2.group(1)
             start = int(match_obj2.group(2))
             # Extract comment data from pages after the first
             if topic_id in self.topicDict:
                 topic = self.topicDict[topic_id]
                 if topic is None:
                     log.error('Unknown program error: this topic already finished crawling and its memory was released. topic id: %s' % topic_id)
                     return False
             else:
                 log.error('Unknown program error: topic id not found in topicDict: %s' % topic_id)
                 self.failed.add(topic_id)
                 self.finished.add(topic_id)
                 return False

             comment_list = topic.parse(webPage, False) # non-firstpage parsing
             # Save to file
             self.saveThread.putTask(self._save_handler, comment_list, topic = None)
         else:
             #pdb.set_trace()
             log.info('Malformed topic link: %s in Group: %s.' % (url, self.groupID))
         # Check whether this topic is finished; if so, release its dict entry.
         # This matters: memory usage grows quickly with the number of topics.
         if topic is not None and topic.isComplete():
             self.topicDict[topic_id] = None
             self.finished.add(topic_id)
             log.info('Topic: %s finished crawling.' % topic_id)

         self.visitedHref.add(url)
         return True
     else:
         # Handle pages that failed to fetch:
         # if any page of a topic fails, the whole topic is moved to finished
         if match_obj is not None:
             # Not even the first page could be fetched, so mark the topic finished
             topic_id = match_obj.group(1)
         elif match_obj2 is not None:
             topic_id = match_obj2.group(1)
             start = int(match_obj2.group(2))
         else:
             log.info('Malformed topic link: %s in Group: %s.' % (url, self.groupID))
             self.visitedHref.add(url)
             return False

         # Record the failed topic id and mark the topic as finished
         self.failed.add(topic_id)
         self.finished.add(topic_id) # some information about this topic may already have been recorded
         self.visitedHref.add(url)
         return False
Example #22

 def _taskHandler(self, url):
     """ Fetch the page at the given url and apply the appropriate access control
     """
     print "Visiting : " + url
     webPage = WebPage(url)

     # Fetch the page content
     flag = webPage.fetch()
     match_obj = RETopic.match(url)
     match_obj2 = REComment.match(url)
     topic = None # guard: stays None if the URL matches neither pattern

     if flag:
         if match_obj is not None:
             topic_id = match_obj.group(1)
             topic = Topic(topic_id, self.group_id)
             comment_list = topic.parse(webPage, isFirstPage = True) # First page parsing
             self.topic_dict[topic_id] = topic
             # Save to a single file (deprecated)
             #self.save_thread.putTask(self._saveHandler, comment_list, topic = topic)
         elif match_obj2 is not None:
             topic_id = match_obj2.group(1)
             start = int(match_obj2.group(2))
             # Extract comment data from pages after the first
             if topic_id in self.topic_dict:
                 topic = self.topic_dict[topic_id]
                 if topic is None:
                     log.error('Unknown program error: extraction for topic id %s already ended and its memory was released.' % topic_id)
                     self.topic_dict[topic_id] = None
                     return False
             else:
                 # Meaning: the first page of comments must be processed first; otherwise
                 # this topic_id would never appear as a key of self.topic_dict
                 log.error('Error: the first page of comments must be extracted first. topic id: %s' % topic_id)
                 self.failed.add(topic_id)
                 self.finished.add(topic_id)
                 return False

             comment_list = topic.parse(webPage, isFirstPage = False) # non-firstpage parsing
             # Save to a single file (deprecated)
             #self.save_thread.putTask(self._saveHandler, comment_list, topic = None)
         else:
             #pdb.set_trace()
             log.info('Malformed topic link: %s in Group: %s.' % (url, self.group_id))
         # Check whether this topic is finished; if so, release its dict entry.
         # This matters: memory usage grows quickly with the number of topics.
         if topic is not None and topic.isComplete():
             self.save_thread.putTask(self._saveTopicHandler, self.topic_dict, topic_id)
             #self.topic_dict[topic_id] = None        # release the resources
             self.finished.add(topic_id)
             log.info('Topic: %s finished crawling.' % topic_id)

         self.visited_href.add(url)
         return True
     else:
         # Handle pages that failed to fetch:
         # if any page of a topic fails, the whole topic is moved to finished
         if match_obj is not None:
             # Not even the first page could be fetched, so mark the topic finished
             topic_id = match_obj.group(1)
         elif match_obj2 is not None:
             topic_id = match_obj2.group(1)
             start = int(match_obj2.group(2))
         else:
             log.info('Malformed topic link: %s in Group: %s.' % (url, self.group_id))
             self.visited_href.add(url)
             return False

         # Record the failed topic id and mark the topic as finished
         self.failed.add(topic_id)
         self.finished.add(topic_id) # some information about this topic may already have been recorded
         self.visited_href.add(url)
         return False
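RETopic and REComment are not shown in either variant above. Given the Douban topic URLs built directly in Example #5 (http://www.douban.com/group/topic/<id>/), plausible shapes are the following; both patterns are assumptions, not from the source:

import re

# Assumed patterns matching the two URL forms the handler distinguishes.
# First page of a topic: http://www.douban.com/group/topic/12345678/
RETopic = re.compile(r'^http://www\.douban\.com/group/topic/(\d+)/?$')
# Later comment pages: http://www.douban.com/group/topic/12345678/?start=100
REComment = re.compile(r'^http://www\.douban\.com/group/topic/(\d+)/\?start=(\d+)')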
Example #23
 def _taskHandler(self, url):
     webPage = WebPage(url)
     if webPage.fetch():
         self._saveTaskResults(webPage)
         self._addUnvisitedHrefs(webPage)
Example #24
def getInfo(url, infos):
    # NOTE: this example arrived truncated. The header and the loop below are
    # a reconstruction inferred from the call sites further down; how `url`
    # was used in the lost portion is unknown.
    info = []
    for i in range(len(infos)):
        try:
            info.append(infos[i].text)
        except Exception, e:
            print e
    return info

def extractUrls(filename):
    try:
        f = open(filename, "rb")
        s = f.read()
        f.close()
    except Exception, e:
        print e
    else:
        urls = re.findall(r'http://cy.5156edu.com/html4/\d+.html', s)
        for url in urls:
            w = WebPage(url)
            d = Database("data.sql")
            if w.fetch():
                try:
                    href, pageSource = w.getDatas()
                    soup = BeautifulSoup(pageSource)
                    temp = soup.find('table', bgcolor='#C0C0C0')
                    infos = temp.find_all("td")
                    info = getInfo(href, infos)
                    d.saveData(info)
                except Exception, e:
                    print e

if __name__ == '__main__':
    extractUrls("spider.log")
Example #25

    def _taskHandler(self, url):
        """ Fetch the page at the given url and apply the appropriate access control
        """
        print "Visiting : " + url
        webPage = WebPage(url)
        
        # Fetch the page content
        flag = webPage.fetch()
        m = regex_post.match(url)
        if m is None:
            log.info(u'Malformed post link: %s in Group: %s.' % (url, self.section_id))
            return True
        else:
            log.info(u'Visiting: ' + url)
            
        comment_page_index = int(m.group('page_index'))
        post_id = m.group('post_id')
        if flag:
            if comment_page_index == 1: # first page of comments
                post = Post(post_id, self.section_id)
                # Parse the first page of the post: the original post content plus its comments
                comment_list = post.parse(webPage, isFirstPage = True) # First page parsing
                self.post_dict[post_id] = post
                self.next_page[post_id] = 2
                
            elif comment_page_index > 1:
                # Extract comment data from pages after the first
                if post_id in self.post_dict:
                    post = self.post_dict[post_id]
                else:
                    # Meaning: the first page of comments must be processed first; otherwise
                    # this post_id would never appear as a key of self.post_dict
                    log.error(u'Error: the first page of comments must be extracted first. post id: %s' % post_id)
                    self.failed.add(post_id)
                    self.finished.add(post_id)
                    return False
                
                if post is None:
                    log.error(u'Unknown program error: extraction for post id %s already ended and its memory was released.' % post_id)
                    self.post_dict[post_id] = None
                    return False
                    
                comment_list = post.parse(webPage, isFirstPage = False) # non-firstpage parsing
            else:
                log.info(u'Malformed post link: %s in Group: %s.' % (url, self.section_id))

            # Check whether this post is finished; if so, release its dict entry.
            # This matters: memory usage grows quickly with the number of posts.
            if post.is_complete():
                self.save_thread.putTask(self._saveTopicHandler, self.post_dict, post_id)
                self.finished.add(post_id)
                log.info(u'Post: %s finished crawling.' % post_id)
                
            self.visited_href.add(url)
            return True
        else:
            # Handle failed fetches; if any page of a post fails, it is moved to finished
            # Record the failed post id and mark the post as finished
            self.failed.add(post_id)
            self.finished.add(post_id) # some information about this post may already have been recorded
            self.visited_href.add(url)
            return False
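regex_post is defined elsewhere; all this handler needs is a match object exposing the named groups post_id and page_index, with page_index equal to 1 on a post's first comment page. A hypothetical stand-in with an invented URL layout:

import re

# Hypothetical pattern; the real definition lives elsewhere in the project.
regex_post = re.compile(
    r'^https?://example\.com/section/\d+/post/(?P<post_id>\d+)'
    r'\?page=(?P<page_index>\d+)$')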