# assumes module-level imports: re, time, json, BeautifulSoup (from bs4),
# plus the project's HtmlDownloader helper
def collectUrls(self, tag, start, orderType, allNeeded):
    # generate the start page
    startPage = "http://book.douban.com/tag/" + tag + "?start=" + str(start) + "&type=" + orderType
    # compute how many list pages are needed (20 books per page)
    if allNeeded % 20 > 0:
        pages = allNeeded / 20 + 1
    else:
        pages = allNeeded / 20
    # init the counter and the collectors
    sid = 0
    collectCount = 0
    bookids = []
    # collect the book ids page by page
    html_downloader = HtmlDownloader(30)
    failCount = 0
    nextPage = ""
    for page in range(1, pages + 1):
        if page == 1:
            url = startPage
        else:
            url = "http://book.douban.com" + nextPage
        failFlag = 0
        retryCount = 0
        while failFlag == 0:
            print "@ downloading page # " + str(page)
            html = html_downloader.download(url)
            if html:
                failFlag = 1
            else:
                if retryCount >= 2:
                    print "!!! ignore page " + str(page) + "!"
                    failCount += 1
                    break
                print "! download error! retrying!"
                retryCount += 1
            # pause between requests
            print "# pause for 1 second!"
            time.sleep(1)
        if html:
            # strip <script> blocks before parsing
            p = re.compile(r'<script.*?</script>', re.S)
            html = re.sub(p, "", html)
            soup = BeautifulSoup(html, "lxml")
            books = soup.find_all(class_='subject-item')
            for book in books:
                sid += 1
                if sid <= allNeeded:
                    bookUrl = book.find_all('h2')[0].a['href']
                    if bookUrl != "":
                        collectCount += 1
                        # the book id is the second-to-last path segment
                        bookid = re.split(r'/', bookUrl)[-2]
                        bookids.append(bookid)
            # remember the relative url of the next list page (absent on the last page)
            nextLinks = soup.select("span.next a")
            if nextLinks:
                nextPage = nextLinks[0]["href"]
        else:
            # without this page we also lose the link to the next one, so stop
            print "!!! failed too many times, exit!"
            break
    print "@ got " + str(collectCount) + " bookids in " + str(sid) + " items!"
    if collectCount < allNeeded:
        print "@ found fewer than you needed!"
    return bookids
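# HtmlDownloader is defined elsewhere in this project. A minimal sketch of what
# the methods here assume (a constructor taking a timeout in seconds, and a
# download() that returns the page body or None on failure) might look like:
#
#     import urllib2
#
#     class HtmlDownloader(object):
#         def __init__(self, timeout):
#             self.timeout = timeout
#
#         def download(self, url):
#             try:
#                 return urllib2.urlopen(url, timeout=self.timeout).read()
#             except Exception:
#                 return None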
def export(self, bookids, tag):
    if len(bookids) == 0:
        print "found no ids, no data downloaded!"
        return
    sid = 0
    html_downloader = HtmlDownloader(30)
    # "ticket" budget: replace the downloader every 50 requests to avoid being blocked
    ticket = 50
    successCount = 0
    failCount = 0
    for bookid in bookids:
        sid += 1
        bookapi = "https://api.douban.com/v2/book/" + bookid
        failFlag = 0
        retryCount = 0
        while failFlag == 0:
            bookinfo = html_downloader.download(bookapi)
            if bookinfo:
                failFlag = 1
            else:
                if retryCount >= 2:
                    print "!!! ignore # " + str(sid) + " book: " + bookid + "!"
                    break
                print "! download error! retrying!"
                html_downloader = HtmlDownloader(30)
                ticket = 50
                retryCount += 1
            ticket -= 1
            print "# pause for 1 second!"
            time.sleep(1)
            if ticket == 0:
                print "@ to prevent being blocked, generate a new downloader!"
                html_downloader = HtmlDownloader(30)
                ticket = 50
        if bookinfo:
            print "+ book #" + str(sid) + " download ok!"
            bookinfo = json.loads(unicode(bookinfo, "utf-8"))
            rating = bookinfo.get("rating", {})
            # some fields are occasionally missing from the API response, so use
            # .get with defaults; author/translator are lists and must be joined
            # into plain strings before xlwt can write them
            tds = [sid,
                   bookinfo.get("id", ""),
                   bookinfo.get("title", ""),
                   bookinfo.get("subtitle", ""),
                   bookinfo.get("origin_title", ""),
                   bookinfo.get("alt_title", ""),
                   ", ".join(bookinfo.get("author", [])),
                   ", ".join(bookinfo.get("translator", [])),
                   bookinfo.get("publisher", ""),
                   bookinfo.get("pubdate", ""),
                   bookinfo.get("isbn10", ""),
                   bookinfo.get("isbn13", ""),
                   bookinfo.get("catalog", ""),
                   bookinfo.get("author_intro", ""),
                   bookinfo.get("summary", ""),
                   bookinfo.get("pages", ""),
                   bookinfo.get("binding", ""),
                   bookinfo.get("price", ""),
                   rating.get("numRaters", ""),
                   rating.get("average", "")]
            for i in range(0, len(tds)):
                self.table.write(sid, i, tds[i])
            print "+ line #" + str(sid) + " write ok!"
            successCount += 1
        else:
            print "! book #" + str(sid) + " download error!"
            # record at least the sid and bookid for the failed download
            self.table.write(sid, 0, sid)
            self.table.write(sid, 1, bookid)
            print "! line #" + str(sid) + " write error!"
            failCount += 1
    filename = tag + ".xls"
    self.xls.save(filename)
    # tip = "@ downloaded data has been saved as " + filename + "!"
    # tip = unicode(tip, "utf-8").encode("gbk")
    tip = "@ " + str(successCount) + " successfully downloaded, " + str(failCount) + " failed, file saved as " + filename + "!"
    print tip
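# Usage sketch, assuming these methods belong to a spider class (called
# DoubanSpider here purely for illustration) whose __init__ prepares the xlwt
# workbook referenced above as self.xls / self.table:
#
#     import xlwt
#
#     class DoubanSpider(object):
#         def __init__(self):
#             self.xls = xlwt.Workbook(encoding="utf-8")
#             self.table = self.xls.add_sheet("books")
#
#     spider = DoubanSpider()
#     ids = spider.collectUrls("小说", 0, "T", 100)  # first 100 books under the tag
#     spider.export(ids, "novels")                   # writes novels.xls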