示例#1
0
 def get_chapter_content(self, index, url):
     """Fetch one chapter page, strip unwanted elements and store its content.

     If ``chapter_<index+1, zero-padded to 5 digits>.xhtml`` already exists
     under ``<self.path>/<self.bookname>``, only the shared progress counter
     ``self.num`` is advanced, a progress line is printed, and the method
     returns without downloading anything.

     Otherwise the page is downloaded through ``Spider.get_content``,
     optionally gunzipped, decoded with ``self.settings['decode']``, cleaned
     of the selectors listed in ``self.settings['chapter']['rm_eles']`` and
     the inner HTML selected by ``self.settings['chapter']['content']`` is
     passed to ``self.create_chapter``.

     Any exception raised while checking/fetching/decoding is printed and the
     attempt is repeated after a one-second pause, indefinitely.

     Args:
         index: Zero-based chapter position; used for the file name and
             forwarded to ``create_chapter``.
         url: Chapter URL. It is appended to ``self.settings['home']`` when
             ``self.settings['page']['link_concat']`` is truthy.

     Returns:
         None.
     """
     # Retry with a loop rather than self-recursion so that a URL which keeps
     # failing cannot exhaust the interpreter's recursion limit.
     while True:
         _url = url
         try:
             bookdirpath = os.path.join(self.path, self.bookname)
             file_name = 'chapter_%05d.xhtml' % (index + 1)
             if os.path.exists(os.path.join(bookdirpath, file_name)):
                 # ``with`` guarantees the lock is released even if the
                 # progress computation or the print raises; otherwise the
                 # error handler below would block on the still-held lock.
                 with self.mutex:
                     self.num += 1
                     total = len(self.chapters)
                     percent = self.num * 100.0 / total
                     _str = '%s [%.2f%%] (%d/%d) %d 已存在!' % (
                         self.bookname, percent, self.num, total, index)
                     print('\r%s' % _str)
                     sys.stdout.flush()
                 return
             if self.settings['page']['link_concat']:
                 _url = self.settings['home'] + url
             # NOTE(review): Spider.get_content presumably returns raw bytes
             # (they are gunzipped/decoded below) - confirm against Spider.
             html = Spider.get_content(_url)
             if self.settings['chapter']['gzip']:
                 # MAX_WBITS | 16 tells zlib to expect a gzip header/trailer.
                 html = zlib.decompress(html, zlib.MAX_WBITS | 16)
             html = html.decode(self.settings['decode'], 'ignore')
         except Exception as e:
             with self.mutex:
                 print('%s %s' % (_url, str(e)))
                 sys.stdout.flush()
             time.sleep(1)
         else:
             break

     # Drop the XHTML namespace declaration before handing the markup to pq.
     html = html.replace('xmlns="http://www.w3.org/1999/xhtml" /',
                         '').replace('xmlns="http://www.w3.org/1999/xhtml"',
                                     '')
     doc = pq(html)
     if self.settings['chapter']['rm_eles']:
         for cur in self.settings['chapter']['rm_eles']:
             doc(cur).remove()
     self.create_chapter(index,
                         doc(self.settings['chapter']['content']).html())
示例#2
0
 def get_html(self):
     html = Spider.get_content(self.url).decode('utf-8', 'ignore')
     with open('file/weather.html', mode='w', encoding='utf-8') as f:
         f.write(html)
     doc = pq(html)