def _extract(self):
    """Parse the story title and the page list out of ``self._content``.

    Sets ``self._intro`` to the text of the ``<h1>`` found inside the
    ``b-story-header`` div (followed by three newlines) and
    ``self._index`` to the ``value`` attributes of the ``<option>`` tags
    inside ``<select name="page">``.  When the page has no such selector
    the index is ``['1']``.

    Returns:
        0 on success.
        -2 when the story header or its ``<h1>``/``</h1>`` pair is missing
        (``self._intro`` is left as ``''``).
        -1 when the page selector is opened but never closed
        (``self._index`` is not assigned in that case).
    """
    content = self._content
    self._intro = ''

    header = content.find('<div class="b-story-header">')
    if header < 0:
        return -2
    title_open = content.find('<h1>', header)
    if title_open < 0:
        return -2
    title_close = content.find('</h1>', title_open)
    if title_close < 0:
        return -2
    title = content[title_open + len('<h1>'):title_close]
    self._intro = shell.html2text(title).strip() + '\n\n\n'

    select_open = content.find('<select name="page">')
    if select_open < 0:
        # No page selector on the page: treat it as a single page.
        self._index = ['1']
        return 0
    select_close = content.find('</select>', select_open)
    if select_close < 0:
        return -1

    option_marker = '<option value="'
    self._index = []
    pos = select_open
    while pos < select_close:
        pos = content.find(option_marker, pos)
        if pos < 0 or pos >= select_close:
            break
        pos += len(option_marker)
        value_end = content.find('"', pos)
        # The closing quote must exist and belong to this selector;
        # otherwise the slice below would pick up unrelated markup.
        if value_end < 0 or value_end > select_close:
            break
        self._index.append(content[pos:value_end].strip())
        pos = value_end
    return 0
def chapter(self, n):
    """Return the text of chapter ``n``.

    The chapter page is obtained from ``self.read_chapter(n)``; the text
    is whatever lies between the opening tag of the first div whose class
    starts with ``b-story-body-x`` and the next ``</div>``, converted with
    ``shell.html2text``.

    Returns:
        The converted text, or ``None`` when the story body div (or the
        end of its opening tag) cannot be found.
    """
    content = self.read_chapter(n)

    start = content.find('<div class="b-story-body-x')
    if start < 0:
        print('suck1')
        return None
    # Skip to the end of the opening tag; the class name is only a prefix.
    start = content.find('>', start)
    if start < 0:
        print('suck2')
        return None

    end = content.find('</div>', start)
    if end < 0:
        # Unterminated div: keep everything up to the end of the page
        # instead of letting -1 silently chop off the last character.
        end = len(content)
    return shell.html2text(content[start + 1:end])
def __init__(self, url):
    """Download the index page at ``url`` and build the chapter list.

    The page is fetched with ``shell.request_safe`` and decoded as GBK
    (undecodable bytes are ignored).  Afterwards:

    * ``self._url`` is ``url``;
    * ``self._content`` is the decoded page;
    * ``self._index`` is a list of ``(href, title)`` tuples, one per
      ``<dd><a href="<number>.html" title="...">`` link, ordered by
      ``<number>``;
    * ``self._intro`` is the text of the ``mulu_bookinfo`` div followed
      by two newlines, or ``''`` when that div is not present.
    """
    self._url = url
    self._content = shell.request_safe(url).decode('gbk', 'ignore')
    content = self._content

    # Capture the chapter number together with the href so no second
    # regex pass is needed.  ``\d+`` and the escaped dot keep links
    # without a number from matching, and ``[^"]*`` stops the title at
    # its own closing quote even when several links share one line.
    link = re.compile(r'<dd><a\shref="((\d+)\.html)" title="([^"]*)"')
    chapters = sorted(
        (int(number), href, title)
        for href, number, title in link.findall(content)
    )
    self._index = [(href, title) for _, href, title in chapters]

    intro = ''
    start = content.find('<div class="mulu_bookinfo">')
    end = content.find('</div>', start)
    if start >= 0 and end >= 0:
        intro = shell.html2text(content[start:end]) + '\n\n'
    self._intro = intro
def __init__(self, url):
    """Download the index page at ``url`` and build the chapter list.

    The page is fetched with ``shell.request_safe`` and decoded as UTF-8
    (undecodable bytes are ignored).  Afterwards:

    * ``self._url`` is ``url``;
    * ``self._content`` is the decoded page with every character that
      GBK cannot represent removed;
    * ``self._index`` is a list of ``(href, title)`` tuples, one per
      ``<dd><a href="....html">title</a>`` link whose href contains
      ``/id<digits>/<number>``, ordered by ``<number>``;
    * ``self._intro`` is the text of the ``intro`` div followed by two
      newlines, or ``''`` when that div is not present.
    """
    self._url = url
    page = shell.request_safe(url).decode('utf-8', 'ignore')
    # Round-trip through GBK to drop characters it cannot encode.
    # NOTE(review): presumably because later output is written as GBK --
    # confirm against the caller.
    self._content = page.encode('gbk', 'ignore').decode('gbk', 'ignore')
    content = self._content

    link = re.compile(r'<dd><a href="([^.]*)\.html">([^<]*)</a>')
    chapter_number = re.compile(r'/id\d*/(\d+)')
    chapters = []
    for href, title in link.findall(content):
        found = chapter_number.search(href)
        if found is None:
            # Not a chapter link (no /id.../<number> path): skip it
            # rather than abort the whole index.
            continue
        chapters.append((int(found.group(1)), href, title))
    chapters.sort()
    self._index = [(href + '.html', title) for _, href, title in chapters]

    intro = ''
    start = content.find('<div id="intro">')
    end = content.find('</div>', start)
    if start >= 0 and end >= 0:
        intro = shell.html2text(content[start:end]) + '\n\n'
    self._intro = intro
def chapter(self, n):
    """Return the text of chapter ``n``.

    The chapter page is obtained from ``self.read_chapter(n)``; the text
    is the inside of the first ``<div id="content">...</div>`` match,
    converted with ``shell.html2text``.

    Returns:
        The converted text, or ``None`` when the page has no matching
        content div.
    """
    content = self.read_chapter(n)
    # ``.`` does not match newlines here, so the div is only found when
    # its opening and closing tags sit on the same line of the page.
    match = re.search(r'<div id="content">(.*)</div>', content)
    if match is None:
        return None
    return shell.html2text(match.group(1))