예제 #1
0
	def collectUrls(self, tag, start, orderType, allNeeded):
		"""Collect up to ``allNeeded`` book ids from a douban tag listing.

		The first listing page is built from ``tag``, ``start`` and
		``orderType``; each following page is reached through the
		``span.next a`` link found on the page before it.

		Args:
			tag: tag name appended to the listing URL.
			start: offset placed in the ``start`` query parameter.
			orderType: value placed in the ``type`` query parameter.
			allNeeded: maximum number of book ids to collect.

		Returns:
			A list of book id strings, each taken from the second-to-last
			``/``-separated piece of a book link. It may be shorter than
			``allNeeded`` when a page cannot be downloaded or the listing
			runs out of pages.

		Progress and errors are reported with ``print``.
		"""
		base_url = "http://book.douban.com"
		# The page count is derived from 20 entries per listing page.
		page_size = 20
		max_attempts = 3
		start_page = base_url + "/tag/" + tag + "?start=" + str(start) + "&type=" + orderType
		# Ceiling division; ``//`` keeps the result an int.
		pages = (allNeeded + page_size - 1) // page_size

		sid = 0            # every listing entry seen, wanted or not
		collectCount = 0   # entries that actually produced a book id
		bookids = []
		script_re = re.compile(r'<script.*?</script>', re.S)
		# Nothing to fetch when no page is requested, so skip the downloader.
		html_downloader = HtmlDownloader(30) if pages > 0 else None

		url = start_page
		for page in range(1, pages + 1):
			html = None
			for attempt in range(max_attempts):
				print("@ downloading page # " + str(page))
				html = html_downloader.download(url)
				if html:
					break
				if attempt + 1 < max_attempts:
					print("! download error! retrying!")
			else:
				# All attempts failed; ``page`` is an int and must be
				# converted before it is concatenated.
				print("!!! ignore page " + str(page) + "!")
			# pause between pages
			print("# pause for 1 second!")
			time.sleep(1)
			if not html:
				# Without this page the link to the next one is unknown.
				print("!!! fail too many times, exit!")
				break

			# Script blocks are stripped before the markup is parsed.
			soup = BeautifulSoup(script_re.sub("", html), "lxml")
			for book in soup.find_all(class_='subject-item'):
				sid += 1
				if sid > allNeeded:
					# Keep counting entries for the summary line only.
					continue
				href = book.find_all('h2')[0].a['href']
				if not href:
					# An empty link carries no id to extract.
					continue
				collectCount += 1
				bookids.append(href.split('/')[-2])

			next_links = soup.select("span.next a")
			if not next_links:
				# No "next" link: this was the last listing page.
				break
			url = base_url + next_links[0]["href"]

		print("@ get " + str(collectCount) + " bookids in " + str(sid) + " items!")
		if collectCount < allNeeded:
			print("@ found less than you needed!")
		return bookids
예제 #2
0
	def export(self, bookids, tag):
		"""Download metadata for each book id and save it as ``<tag>.xls``.

		For every id the douban v2 book API URL is downloaded (up to three
		attempts) and one row is written to ``self.table`` at row index
		``sid``, the 1-based position of the id in ``bookids``. A book that
		cannot be downloaded, or whose response is not the expected JSON
		document, gets a row holding only ``sid`` and the book id. After
		all ids are processed the workbook is saved through ``self.xls``.

		Args:
			bookids: sequence of book id strings; when empty nothing is
				downloaded and no file is written.
			tag: used as the base name of the saved ``.xls`` file.

		Returns:
			None. Progress and the final summary are reported with ``print``.
		"""
		if not bookids:
			print("found no id, no data downloaded!")
			return

		# Top-level response keys, in output column order after ``sid``.
		fields = (
			"id", "title", "subtitle", "origin_title", "alt_title",
			"author", "translator", "publisher", "pubdate", "isbn10",
			"isbn13", "catalog", "author_intro", "summary", "pages",
			"binding", "price",
		)
		max_attempts = 3
		tickets_per_downloader = 50

		html_downloader = HtmlDownloader(30)
		ticket = tickets_per_downloader
		successCount = 0
		failCount = 0
		for sid, bookid in enumerate(bookids, 1):
			bookapi = "https://api.douban.com/v2/book/" + bookid
			bookinfo = None
			for attempt in range(max_attempts):
				bookinfo = html_downloader.download(bookapi)
				if bookinfo:
					break
				if attempt + 1 == max_attempts:
					print("!!! ignore # " + str(sid) + " book: " + bookid + "!")
				else:
					print("! download error! retrying!")
					# Retry with a fresh downloader and a full ticket budget.
					html_downloader = HtmlDownloader(30)
					ticket = tickets_per_downloader
			ticket -= 1
			print("# pause for 1 second!")
			time.sleep(1)
			if ticket <= 0:
				print("@ to prevent blocked, generate a new downloader!")
				html_downloader = HtmlDownloader(30)
				ticket = tickets_per_downloader

			row = None
			if bookinfo:
				print("+ book #" + str(sid) + " download ok!")
				try:
					info = json.loads(unicode(bookinfo, "utf-8"))
					rating = info["rating"]
					row = [sid] + [info[key] for key in fields]
					row.append(rating["numRaters"])
					row.append(rating["average"])
				except (ValueError, KeyError, TypeError):
					# Invalid JSON or a missing field must not abort the
					# run: the workbook is only saved after the loop.
					row = None
					print("! book #" + str(sid) + " returned unexpected data!")
			else:
				print("! book #" + str(sid) + " download error!")

			if row is not None:
				# NOTE(review): values are written as decoded; confirm that
				# self.table.write accepts non-scalar values such as lists.
				for column, value in enumerate(row):
					self.table.write(sid, column, value)
				print("+ line #" + str(sid) + " write ok!")
				successCount += 1
			else:
				# Keep a placeholder row so the failed id can be found later.
				self.table.write(sid, 0, sid)
				self.table.write(sid, 1, bookid)
				print("! line #" + str(sid) + " write error!")
				failCount += 1

		filename = tag + ".xls"
		self.xls.save(filename)
		tip = "@ " + str(successCount) + " successfully downloaded, " + str(failCount) + " failed, file saved successfully!"
		print(tip)