def get_chapter(url): print("Processing: " + url) html = gsweb.get_soup(url) if 'wuxiaworld.com' in url: chapter_title, contents = get_wuxiaworld_com(html) elif 'wuxiaworld.co' in url: chapter_title, contents = get_wuxiaworld_co(html) elif 'stabbingwithasyringe' in url: chapter_title, contents = get_syringe(html) else: raise SystemExit('Something went wrong! Unsuported server!') # Novel dependant cleanup try: print('Cleaning...') novel = __import__(novel_module) contents = novel.clean(contents) print('Clean') except ImportError: pass soup_str = "".join(map(str, contents)) # Before turning the html into a soup, replace all weird chinese spaces # with actual spaces. soup_str = soup_str.replace(' ', ' ') # And replace double br tags with a paragraph break soup_str = re.sub(r'</br>', '', soup_str) soup_str = re.sub(r'<br/>[\t\n\r\f\v\s ]*<br/>', '\n<p>', soup_str) soup_str = re.sub(r'<br/>', '</p>\n<p>', soup_str) print(chapter_title) chapter_file = clean_chapter_name(chapter_title) print(chapter_file) # Then turn the string back into a soup soup_text = BeautifulSoup(soup_str, 'lxml') # Remove all atributes from all tags for tag in soup_text.findAll(True): tag.attrs = {} # Remove empty paragraphs, including those which only contain br tags or # the weird space character (why the &·$% do you have a paragraph with # nothing?) for paragraph in soup_text.findAll(['span', 'p']): if not paragraph.text or paragraph.text in [' ', '。']: paragraph.decompose() # Remove stray br tags # for br_tag in soup_text.findAll('br'): # br_tag.decompose() # Turn the soup into text # text = str(soup_text) text = soup_text.prettify() # Undo some ridiculous censoring # text = damnit.sub('damn it', text) # text = damned.sub('damned', text) # text = f**k.sub('f**k', text) return chapter_title, chapter_file, text
def get_chapter(url):
    global chapterCount
    chapterCount = chapterCount + 1
    pagehtml = gsweb.get_soup(url)
    print("Current url: " + url)

    # The number of pages in the chapter is embedded in the page source as
    # '"pages":N,'.
    pages_re = re.compile('"pages":([0-9]*),', re.IGNORECASE)
    pages = int(pages_re.search(str(pagehtml)).group(1))
    print("Pages in this chapter: {}".format(pages))

    text = []
    chaptertitle = pagehtml.select('h1.h2')[0].get_text().strip()
    chapterfile = "{}.xhtml".format(
        chaptertitle.replace(" ", "-") + "-" + str(chapterCount))
    text.append("<h2>{}</h2>\n".format(chaptertitle))

    # Fetch every page of the chapter and wrap each one in its own div.
    for i in range(1, pages + 1):
        page_url = url + "/page/" + str(i)
        print("Working on: " + page_url)
        text.append('<div class="page">\n')
        for j in get_page(page_url):
            text.append(j.prettify())
        text.append('</div>\n')

    chapter = "".join(text)
    return chaptertitle, chapterfile, chapter

def genlist(start, end):
    # Walk the chapter index page and build the list of chapter URLs.
    # The per-chapter special cases below work around the wildly
    # inconsistent link titles used on the index page.
    global origin
    list_page = gsweb.get_soup(origin)
    chapterlist = []
    for i in range(start, end + 1):
        # print(i)
        if i in [29, 115, 342, 825, 1183, 1794]:
            continue
        elif i in range(1, 572 + 1):
            text = '^Chapter %s .*' % str(i)
            if i == 370:
                text = '^Chapter %s$' % str(i)
            elif i in [351, 353, 354]:
                text = '^Chapter %s - ' % str(i)
        elif i in [573]:
            text = '^AST: Chapter %s .*' % str(i)
        elif i in [584, 585, 586, 587, 588, 589, 605, 616]:
            text = '^AST: Chapter %s!$' % str(i)
        elif i in [590, 800]:
            text = '^chapter %s$' % str(i)
        elif i in [596, 598, 799, 1416] + list(range(1440, 2492 + 1)):
            text = '^AST %s ' % str(i)
            if i in [1797, 1957, 2281]:
                text = '^AST %s- ' % str(i)
            elif i == 2345:
                text = '^AST 2345 - Fifth .*'
            elif i == 2435:
                text = '^AST 2345 - Tyrannous .*'
            elif i in [2468, 2473]:
                text = '^Chapter %s - ' % str(i)
        elif i in [597, 600, 603, 606, 609, 610, 611, 613, 614, 615, 617,
                   619] + list(range(591, 595 + 1)) + list(range(623, 626 + 1)):
            text = '^Chapter %s$' % str(i)
        elif i in [599, 601, 602, 604, 607, 608, 621, 668, 670, 671, 672,
                   675, 676, 677, 679, 681, 682, 684, 685, 686, 687, 689,
                   691]:
            text = '^Chapter %s!' % str(i)
        elif i in [612]:
            text = '^Chapter %s .*' % str(i)
        elif i in [618, 620, 622, 627, 631, 633, 635, 639, 642, 645, 648,
                   650]:
            text = '^AST Chapter: %s!' % str(i)
        elif i in [654, 658, 661, 663, 666, 669, 674, 678, 680, 683, 688,
                   690, 693] + list(range(697, 798 + 1)) + \
                list(range(801, 1415 + 1)) + list(range(1417, 1436 + 1)):
            text = '^Chapter: %s$' % str(i)
            if i == 1184:
                text = '^1184$'
        elif i == 2493:
            text = '^Author.*'
        else:
            text = '^Chapter %s$' % str(i)
        link = list_page.find('a', text=re.compile(text))
        url = origin + link['href'].split('/')[-1]
        chapterlist.append(url)
    return chapterlist

def get_page(text_url):
    # The chapter text lives inside a <pre> element; return its child tags.
    text = gsweb.get_soup(text_url).select_one('pre').findChildren()
    return text

def get_book(initial_url):
    base_url = 'http://www.wattpad.com'
    html = gsweb.get_soup(initial_url)

    # Get basic book information
    author = html.select('div.author-info__username')[0].get_text()
    title = html.select('div.story-info__title')[0].get_text().strip()
    description = html.select('pre.description-text')[0].get_text()
    coverurl = html.select('div.story-cover img')[0]['src']
    labels = ['Wattpad']
    for label in html.select('div.tags a'):
        if '/' in label['href']:
            labels.append(label.get_text())

    if debug:
        print("Author: " + author)
        print("Title: " + title)
        print("Description: " + description)
        print("Cover: " + coverurl)
        print("Labels:" + " ".join(labels))

    print("'{}' by {}".format(title, author).encode("utf-8"))
    # print(next_page_url)

    # Get list of chapters
    chapterlist = list(dict.fromkeys(html.select('.story-parts ul li a')))

    # Remove from the file name those characters that Microsoft does NOT
    # allow. This also affects the FAT filesystem used on most phone/tablet
    # sdcards and other devices used to read epub files.
    # Disallowed characters: \/:*?"<>|^
    filename = title
    for i in ['\\', '/', ':', '*', '?', '"', '<', '>', '|', '^']:
        if i in filename:
            filename = filename.replace(i, '')
    # Apple products disallow files starting with dot
    filename = filename.lstrip('.')

    epubfile = "{} - {}.epub".format(filename, author)
    if not os.path.exists(epubfile):
        identifier = "wattpad.com//%s/%s" % (initial_url.split('/')[-1],
                                             len(chapterlist))
        LANGUAGE = 'en'
        book = MyBook(identifier, title, LANGUAGE, 'wattpad2epub')
        book.add_author(author)

        # Add all labels.
        book.add_labels(labels)

        # Add a cover if it's available
        cover_file = 'cover.jpg'
        if get_cover(coverurl, cover_file):
            book.add_cover(cover_file)
            os.remove(cover_file)

        # Define CSS style
        with open(os.path.join(PROG_DIR, "CSS", "nav.css")) as style_nav:
            book.add_nav_style(style_nav.read())
        with open(os.path.join(PROG_DIR, "CSS", "body.css")) as style_body:
            book.add_body_style(style_body.read())

        # Introduction
        book.add_intro(author, initial_url, description,
                       os.path.join(PROG_DIR, "HTML", "intro.xhtml"))

        for item in chapterlist:
            chaptertitle = item.get_text().strip().replace("/", "-")
            if chaptertitle.upper() != "A-N":
                print("Working on: {}".format(chaptertitle).encode("utf-8"))
                ch_title, ch_file, ch_text = get_chapter("{}{}".format(
                    base_url, item['href']))
                book.add_chapter(chaptertitle, ch_file, LANGUAGE, ch_text)

        # Define Table of Contents, NCX, Nav and book spine
        book.finalize()

        # Write the epub to file
        book.write(epubfile)
    else:
        print("Epub file already exists, not updating")

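# Hypothetical usage note (not part of the original source): get_book() is
# the entry point for a whole story, so a caller would typically pass the
# story's overview URL, e.g.:
#
#     get_book('https://www.wattpad.com/story/1234567-some-story')
#
# The URL above is made up; any Wattpad story page with the expected
# author/title/cover markup should work.
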
def get_html(url):
    return gsweb.get_soup(url)
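
# Hypothetical usage sketch (not part of the original source): a driver
# script could wire genlist() and get_chapter() together roughly like this,
# assuming `origin` points at the novel's index page and `book` is the same
# kind of MyBook epub wrapper used in get_book() above:
#
#     chapters = genlist(1, 10)            # URLs for chapters 1 to 10
#     for url in chapters:
#         title, filename, text = get_chapter(url)
#         book.add_chapter(title, filename, 'en', text)
#     book.finalize()
#     book.write('novel.epub')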