예제 #1
0
 def main(self):
     """Run the crawl, walking index pages from the saved start page down to 2.

     Side effects visible in this method:
       * ``out.log`` is opened for writing and ``sys.stdout`` is rebound to
         it, so every later ``print`` goes to that file.
       * The first line of ``page.txt`` is parsed as an integer; the crawl
         starts at that value minus one. A result of 0 falls back to
         ``self.total_num``.
       * After each page below the starting one, the page number is written
         back to ``page.txt``.

     Raises:
         ValueError: if the first line of ``page.txt`` is not an integer.
     """
     # Explicit UTF-8: the log messages are Chinese, and the locale default
     # encoding may not be able to represent them.
     # Not closed here: sys.stdout stays bound to this handle after return.
     f_handler = open('out.log', 'w', encoding='utf-8')
     sys.stdout = f_handler

     # The context manager closes the file even if parsing below fails.
     with open('page.txt', 'r') as page:
         content = page.readline()
     start_page = int(content.strip()) - 1

     print (self.getCurrentTime(),"开始页码",start_page)
     print (self.getCurrentTime(),"爬虫正在启动,开始爬取爱问知识人问题")
     self.total_num = self.getTotalPageNum()
     print (self.getCurrentTime(),"获取到目录页面个数",self.total_num,"个")
     if not start_page:
         # getTotalPageNum is defined outside this block; coerce in case it
         # returns a numeric string, since range() needs an int.
         start_page = int(self.total_num)

     # Same sequence as start_page - x + 1 for x in range(1, start_page):
     # start_page, start_page - 1, ..., 2.
     for current_page in range(start_page, 1, -1):
         print (self.getCurrentTime(),"正在抓取第",current_page,"个页面")
         try:
             self.getQuestions(current_page)
         except urllib.error.URLError as e:
             if hasattr(e, "reason"):
                 print (self.getCurrentTime(),"某总页面内抓取或提取失败,错误原因", e.reason)
         except Exception as e:
             # Best effort: report the failure and carry on with the next page.
             print (self.getCurrentTime(),"某总页面内抓取或提取失败,错误原因:",e)
         # The starting page itself is never written back, only later pages.
         if current_page < start_page:
             with open('page.txt', 'w') as f:
                 f.write(str(current_page))
             print (self.getCurrentTime(),"写入新页码",current_page)
예제 #2
0
    def main(self):
        """Run the crawl, walking index pages downwards from the saved page.

        The first line of ``page.txt`` is parsed as an integer and reduced by
        one; a result of 0 falls back to ``self.total_num``. Pages
        ``start_page + 1`` down to 2 are then passed to ``self.getQuestions``.
        After each page below ``start_page`` the page number is written back
        to ``page.txt``.

        Raises:
            ValueError: if the first line of ``page.txt`` is not an integer.
            Any exception from ``self.getQuestions`` other than
            ``urllib.error.HTTPError`` propagates to the caller.
        """
        # The stdout redirect to out.log is disabled in this variant, so the
        # handle is not needed. Opening in 'w' mode still truncates the file
        # as before; closing it immediately avoids leaking the descriptor.
        open('out.log', 'w', encoding='utf-8').close()

        # The context manager closes the file even if parsing below fails.
        with open('page.txt', 'r') as page:
            content = page.readline()
        start_page = int(content.strip()) - 1

        print(self.getCurrentTime()+"开始页码"+str(start_page))
        print(self.getCurrentTime()+"爬虫正在启动,开始爬取爱问知识人问题")
        self.total_num = self.getTotalPageNum()
        print(self.getCurrentTime()+"获取到目录页面个数"+str(self.total_num)+"个")
        if not start_page:
            # getTotalPageNum is defined outside this block; coerce in case
            # it returns a numeric string, since range() needs an int.
            start_page = int(self.total_num)
        print(str(start_page))

        # Same sequence as start_page - x + 1 for x in range(0, start_page):
        # start_page + 1, start_page, ..., 2.
        for current_page in range(start_page + 1, 1, -1):
            print(self.getCurrentTime()+"正在抓取第"+str(current_page) +"个页面")
            try:
                self.getQuestions(current_page)
            except urllib.error.HTTPError as e:
                if hasattr(e, "reason"):
                    # str(): reason is not guaranteed to be a string, and
                    # concatenating a non-str would raise TypeError here.
                    print(self.getCurrentTime()+"某总页面内抓取或提取失败,错误原因"+str(e.reason))
            if current_page < start_page:
                with open('page.txt', 'w') as f:
                    f.write(str(current_page))
                print(self.getCurrentTime()+"写入新页码"+str(current_page))
예제 #3
0
파일: spider.py 프로젝트: Germey/iaskspider
 def main(self):
     f_handler=open('out.log', 'w') 
     sys.stdout=f_handler
     page = open('page.txt', 'r')
     content = page.readline()
     start_page = int(content.strip()) - 1
     page.close()     
     print self.getCurrentTime(),"开始页码",start_page
     print self.getCurrentTime(),"爬虫正在启动,开始爬取爱问知识人问题"
     self.total_num = self.getTotalPageNum()
     print self.getCurrentTime(),"获取到目录页面个数",self.total_num,"个"
     for x in range(1,start_page):
         print self.getCurrentTime(),"正在抓取第",start_page-x+1,"个页面"
         try:
             self.getQuestions(start_page-x+1)
         except urllib2.URLError, e:
             if hasattr(e, "reason"):
                 print self.getCurrentTime(),"某总页面内抓取或提取失败,错误原因", e.reason
         except Exception,e:  
             print self.getCurrentTime(),"某总页面内抓取或提取失败,错误原因:",e
예제 #4
0
 def main(self):
     f_handler = open('out.log', 'w')
     sys.stdout = f_handler
     page = open('page.txt', 'r')
     content = page.readline()
     start_page = int(content.strip()) - 1
     page.close()
     print self.getCurrentTime(), "开始页码", start_page
     print self.getCurrentTime(), "爬虫正在启动,开始爬取爱问知识人问题"
     self.total_num = self.getTotalPageNum()
     print self.getCurrentTime(), "获取到目录页面个数", self.total_num, "个"
     for x in range(1, start_page):
         print self.getCurrentTime(), "正在抓取第", start_page - x + 1, "个页面"
         try:
             self.getQuestions(start_page - x + 1)
         except urllib2.URLError, e:
             if hasattr(e, "reason"):
                 print self.getCurrentTime(), "某总页面内抓取或提取失败,错误原因", e.reason
         except Exception, e:
             print self.getCurrentTime(), "某总页面内抓取或提取失败,错误原因:", e
예제 #5
0
 def start(self):
     f_handler = open('out.log','w')
     sys.stdout = f_handler
     page = open('page.txt','r')
     content = page.readline()
     start_page = 10
     page.close()
     print self.getCurrentTime(),"开始页码",start_page
     print self.getCurrentDate(),"开始爬取"
     self.total_num = self.getTotalPageNum()
     print self.getCurrentTime(),"获取到目录页面个数",self.total_num,"个"
     if not start_page:
         start_page = self.total_num
     for x in range(1,10):
         print self.getCurrentTime(),"正在抓取第",start_page-x+1,"个页面"
         try:
             self.getQuestions(start_page-x+1)
         except urllib2.URLError, e:
             if hasattr(e, "reason"):
                 print self.getCurrentTime(),"某总页面内抓取或提取失败,错误原因", e.reason
         except Exception,e:
             print self.getCurrentTime(),"某总页面内抓取或提取失败,错误原因:",e
예제 #6
0
 def main(self):
     f_handler = open('out.log', 'w')
     sys.stdout = f_handler
     page = open('page.txt', 'r')
     content = page.readline()
     start_page = int(content.strip()) - 1
     page.close()
     print self.getCurrentTime(), "Starting page", start_page
     print self.getCurrentTime(), "Web crawler is starting, crawles Aiwen"
     self.total_num = self.getTotalPageNum()
     print self.getCurrentTime(), "Get the number of pages: ", self.total_num
     if not start_page:
         start_page = int(self.total_num)
     for x in range(1, start_page):
         print self.getCurrentTime(), "Now crawling the ", start_page-x+1, "page"
         try:
             self.getQuestions(start_page-x+1)
         except urllib2.URLError, e:
             if hasattr(e, "reason"):
                 print self.getCurrentTime(), "Extract info failed, reason: ", e.reason
         except Exception, e:
             print self.getCurrentTime(), "Extract info failed, reason: ", e
 def main(self):
     f_handler = open('out.log', 'w')
     sys.stdout = f_handler
     page = open('page.txt', 'r')
     content = page.readline()
     start_page = int(content.strip()) - 1
     page.close()
     print self.getCurrentTime(), "开始页码", start_page
     print self.getCurrentTime(), "爬虫正在启动,开始爬取爱问知识人问题"
     self.total_num = 100  # 新版的页面总的页数为100
     print self.getCurrentTime(), "获取到目录页面个数", self.total_num, "个"
     if not start_page:
         start_page = self.total_num
     for x in range(1, start_page):
         print self.getCurrentTime(), "正在抓取第", start_page - x + 1, "个页面"
         try: # 这里页面的页码的格式不是按照正常的1 2 3 4,而是用了其他的算法产生页码
             self.getQuestions(start_page - x + 1)
         except urllib2.URLError, e:
             if hasattr(e, "reason"):
                 print self.getCurrentTime(), "某总页面内抓取或提取失败,错误原因", e.reason
         except Exception, e:
             print self.getCurrentTime(), "某总页面内抓取或提取失败,错误原因:", e
	def main(self):
		f_handler = open('out.log', 'w')
		sys.stdout = f_handler
		page = open('page.txt', 'r')
		content = page.readline()
		start_page = int(content.strip()) - 1
		page.close()
		print self.getCurrentTime(),"start page is ", start_page
		print self.getCurrentTime(),"spider running"
		self.total_num = self.getTotalPageNum()
		print self.getCurrentTime(), "get total index page num", self.total_num
		if not start_page:
			start_page = self.total_num
		for x in range(1, start_page):
			print self.getCurrentTime(), "parsing No.", start_page - x + 1, "'s page"
			try:
				self.getQuestions(start_page - x + 1)
			except urllib2.URLError, e:
				if hasattr(e, "reason"):
					print self.getCurrentTime(),"parse this page failed, reason:", e.reason
			except Exception,e:
				print self.getCurrentTime,"parse this page failed, reason:", e