def fetch_contents(book, soup):
    """Append one chapter to *book* for every link in the ``list_box`` div.

    Each chapter's text is wrapped in ``yem.Text.for_html`` together with
    the chapter URL and ``fetch_text`` as the loader.

    :param book: book object exposing ``append`` (a ``yem.Book`` in the script).
    :param soup: parsed index page that contains the chapter list.
    """
    host = host_of(soup)
    div = find_tag(soup, "div", "list_box")
    for a in div.find_all("a"):
        href = a.get("href")
        if not href:
            # An anchor without a target cannot be turned into a chapter.
            continue
        # ``a.string`` is None when the anchor has several children; fall
        # back to the concatenated text instead of raising AttributeError.
        title = (a.string or a.get_text()).strip()
        text = yem.Text.for_html(host + href, fetch_text)
        book.append(yem.Chapter(text=text, title=title))


def fetch_text(url):
    """Download a chapter page and return its text, one paragraph per line.

    Only bare text nodes that are direct children of the ``box_box`` div are
    kept; empty ones are dropped. The last collected line is discarded
    (presumably trailing site boilerplate -- TODO confirm).

    :param url: address of the chapter page.
    :return: the chapter text joined with ``"\\n"``, or ``""`` on any failure.
    """
    try:
        soup = fetch_html(url, encoding=ENCODING)
        div = find_tag(soup, "div", "box_box")
        lines = []
        for node in div:
            # Text nodes have no tag name; real tags are skipped.
            if node.name is None:
                stripped = node.string.strip()
                if stripped:
                    lines.append(stripped)
        if lines:
            # Guard explicitly instead of relying on a swallowed IndexError.
            lines.pop()
        return "\n".join(lines)
    except Exception:
        # Best effort: a chapter that cannot be fetched or parsed becomes
        # empty text, but KeyboardInterrupt/SystemExit still propagate.
        return ""


if __name__ == "__main__":
    url = "http://234zw.com/xingjiqiyuan/"
    book = yem.Book()
    soup = fetch_attributes(book, url)
    fetch_contents(book, soup)
    yem.make_book(book, r"E:\tmp")
def fetch_contents(book, soup):
    """Append one chapter to *book* for every ``<dd>`` entry that holds a link.

    The chapter title is the link text with a trailing ``" NN-NN"`` fragment
    removed (presumably an update date -- TODO confirm). The chapter object
    is passed to ``yem.Text.for_html`` as ``tag`` so ``fetch_text`` receives it.

    :param book: book object exposing ``append`` (a ``yem.Book`` in the script).
    :param soup: parsed index page that contains the chapter list.
    """
    host = host_of(soup)
    # Compiled once; the pattern itself is unchanged.
    date_suffix = re.compile(r'\s[\d]{2}-[\d]{2}')
    for dd in soup.find_all('dd'):
        # ``dd.next`` returned whatever node followed the <dd> tag, which can
        # be a whitespace text node or an unrelated element; look up the
        # anchor explicitly and skip entries that have none.
        a = dd.find('a')
        if a is None or not a.get('href'):
            continue
        # ``a.string`` is None when the anchor has several children.
        raw_title = (a.string or a.get_text()).strip()
        chapter = yem.Chapter(title=date_suffix.sub('', raw_title))
        chapter.text = yem.Text.for_html(host + a['href'], fetch_text,
                                         tag=chapter)
        book.append(chapter)


def fetch_text(url, chapter):
    """Download a chapter page and return the text of its ``div#content``.

    :param url: address of the chapter page.
    :param chapter: chapter being fetched; only its ``title`` is read, for
        the progress message.
    :return: the stripped strings of the content div joined with
        ``yem.LINE_SEPARATOR``, or ``''`` on any failure.
    """
    try:
        print('fetching text:', chapter.title)
        soup = fetch_html(url, encoding=ENCODING)
        if soup is None:
            app_error('cannot open url: {0}', url)
            return ''
        content = soup.find('div', id='content')
        if content is None:
            # Page layout differs from what is expected; same result as the
            # previously swallowed AttributeError.
            return ''
        return yem.LINE_SEPARATOR.join(content.stripped_strings)
    except Exception:
        # Best effort: a chapter that cannot be fetched or parsed becomes
        # empty text, but KeyboardInterrupt/SystemExit still propagate.
        return ''


if __name__ == "__main__":
    url = "http://www.mangg.com/id28111/"
    book = yem.Book()
    soup = fetch_attributes(book, url)
    fetch_contents(book, soup)
    args = {
        "pmab.text.encoding": "gb18030"
    }
    yem.make_book(book, r"E:\tmp", "pmab", **args)