def get_chapter_content(self, index, url):
    """Fetch one chapter, strip unwanted markup and pass it to ``create_chapter``.

    If ``chapter_<index + 1, zero-padded to 5 digits>.xhtml`` already exists
    under ``<self.path>/<self.bookname>``, the shared progress counter
    ``self.num`` is incremented, a status line is printed, and the method
    returns without downloading anything.

    Otherwise the page is fetched with ``Spider.get_content``, gunzipped when
    ``settings['chapter']['gzip']`` is truthy, and decoded with
    ``settings['decode']`` (undecodable bytes are ignored). Any ``Exception``
    raised during the existence check or the download is printed, and the
    whole attempt is repeated after a one-second pause, with no retry limit.

    Args:
        index: Chapter position; the on-disk file is numbered ``index + 1``.
        url: Chapter link. ``settings['home']`` is prepended when
            ``settings['page']['link_concat']`` is truthy.

    Returns:
        None.
    """
    # Retry in a loop rather than by recursion so that a persistently failing
    # URL cannot exhaust the interpreter's recursion limit.
    while True:
        _url = url
        try:
            bookdirpath = os.path.join(self.path, self.bookname)
            file_name = 'chapter_' + '%05d' % (index + 1) + '.xhtml'
            if os.path.exists(os.path.join(bookdirpath, file_name)):
                self.mutex.acquire()
                try:
                    self.num += 1
                    total = len(self.chapters)
                    percent = self.num * 100.0 / total
                    _str = '%s [%.2f%%] (%d/%d) %d 已存在!' % (
                        self.bookname, percent, self.num, total, index)
                    print('\r%s' % _str)
                    sys.stdout.flush()
                finally:
                    # Always release: if the lock stayed held after an error,
                    # the except-handler below would block on acquire().
                    self.mutex.release()
                return

            if self.settings['page']['link_concat']:
                _url = self.settings['home'] + url
            html = Spider.get_content(_url)
            if self.settings['chapter']['gzip']:
                # MAX_WBITS | 16 tells zlib to expect a gzip header/trailer.
                html = zlib.decompress(html, zlib.MAX_WBITS | 16)
            html = html.decode(self.settings['decode'], 'ignore')
        except Exception as e:
            self.mutex.acquire()
            try:
                print('%s %s' % (_url, str(e)))
                sys.stdout.flush()
            finally:
                self.mutex.release()
            time.sleep(1)
        else:
            break

    # Drop the XHTML namespace declaration before parsing.
    html = html.replace('xmlns="http://www.w3.org/1999/xhtml" /', '').replace(
        'xmlns="http://www.w3.org/1999/xhtml"', '')
    doc = pq(html)
    if self.settings['chapter']['rm_eles']:
        for cur in self.settings['chapter']['rm_eles']:
            doc(cur).remove()
    self.create_chapter(index, doc(self.settings['chapter']['content']).html())
def get_html(self):
    """Download ``self.url``, save the decoded text to disk and parse it.

    The page is fetched through ``Spider.get_content`` and decoded as UTF-8,
    written to ``file/weather.html``, then parsed with ``pq``.

    NOTE(review): ``doc`` is not used in the portion of the method visible
    here and nothing is returned — confirm whether the method continues
    beyond this point.
    """
    # 'ignore' silently drops bytes that are not valid UTF-8 instead of
    # raising. Presumably get_content returns bytes — TODO confirm.
    html = Spider.get_content(self.url).decode('utf-8', 'ignore')
    # The path is relative to the current working directory; mode 'w'
    # overwrites any previous copy. open() does not create the 'file'
    # directory, so it must already exist.
    with open('file/weather.html', mode='w', encoding='utf-8') as f:
        f.write(html)
    # Presumably pq is pyquery's PyQuery constructor — verify against imports.
    doc = pq(html)