def delete_xhtml_attributes(bk, attributes: dict, prefs: MutableMapping) -> None:
    for xhtml_id, xhtml_href in bk.text_iter():
        if prefs['parse_only_selected_files'] and xhtml_href not in prefs['selected_files']:
            continue
        soup = gumbo_bs4.parse(bk.readfile(xhtml_id))
        for elem in soup.find_all(True):
            try:
                if elem['id'] in attributes['ids']:
                    del elem['id']
            except KeyError:
                pass
            classes = elem.get('class', [])
            if isinstance(classes, str):
                classes = [classes]
            for class_ in classes.copy():
                if class_ in attributes['classes']:
                    try:
                        elem['class'].remove(class_)
                    except AttributeError:
                        del elem['class']
            # This should never raise a KeyError, but with some installations
            # (unclear whether Python, Sigil, BeautifulSoup or Gumbo versions
            # are to blame) elements keep empty class attributes.
            try:
                if not classes:
                    del elem['class']
            except KeyError:
                pass
        bk.writefile(xhtml_id, soup.serialize_xhtml())
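# Minimal usage sketch for delete_xhtml_attributes(), assuming a Sigil plugin
# context where `bk` is the BookContainer passed to the plugin's run() hook.
# The key names below mirror how the function reads its arguments, but the
# values are illustrative only.
example_prefs = {'parse_only_selected_files': False, 'selected_files': []}
example_attributes = {'ids': {'obsolete-anchor'}, 'classes': {'MsoNormal'}}
# delete_xhtml_attributes(bk, example_attributes, example_prefs)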
def remove_mo_attributes(self, data, remove_class=True, remove_id=True):
    """
    Remove MO attributes from tags in the given XHTML file,
    and return the resulting XHTML string.

    :param data: the source code
    :type  data: str
    :param remove_class: remove the MO class attribute
    :type  remove_class: bool
    :param remove_id: remove the MO id attribute
    :type  remove_id: bool
    """
    msgs = []
    if (not remove_class) and (not remove_id):
        return (msgs, data)
    import sigil_gumbo_bs4_adapter as gumbo_bs4
    soup = gumbo_bs4.parse(data)
    for node in soup.find_all():
        if node.name in self.tags:
            if self.has_mo_class(node):
                if remove_class:
                    self.remove_mo_class(node)
                    msgs.append(("INFO", "removed class 'mo' from element '%s'" % (node.name)))
                if remove_id:
                    if (self.existing_ids_only) and (self.has_mo_id(node)):
                        msgs.append(("WARN", "element '%s' with MO id '%s' => not removing" % (node.name, node.attrs["id"])))
                    elif self.has_mo_id(node):
                        old_id = node.attrs["id"]
                        self.remove_id_attribute(node)
                        msgs.append(("INFO", "removed id '%s' from element '%s'" % (old_id, node.name)))
                    elif self.has_id_not_mo(node):
                        msgs.append(("WARN", "element '%s' with id '%s' => not removing" % (node.name, node.attrs["id"])))
    out_data = self.output_xhtml_code(soup)
    return (msgs, out_data)
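# The (level, text) message tuples returned above can be printed uniformly;
# a small self-contained helper (illustrative, not part of the plugin):
def print_messages(msgs):
    for level, text in msgs:
        print("[%s] %s" % (level, text))

print_messages([("INFO", "removed class 'mo' from element 'p'")])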
def build_html(fragment, css=False):
    fragment = regex.sub(r'<p([^>]*)></p>', r'<p\1> </p>', fragment)
    css_link = ''
    if css:
        css_link = LINK_TEXT
    new = HTML.format(css_link, fragment)
    soup = gumbo_bs4.parse(new)
    return soup.serialize_xhtml()
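# The module-level HTML skeleton and LINK_TEXT used by build_html() are not
# shown here; illustrative shapes (the EXAMPLE_ prefix marks them as guesses).
# The first {} receives the optional stylesheet link, the second the fragment.
EXAMPLE_LINK_TEXT = '<link href="../Styles/stylesheet.css" rel="stylesheet" type="text/css"/>'
EXAMPLE_HTML = ('<?xml version="1.0" encoding="utf-8"?>\n'
                '<html xmlns="http://www.w3.org/1999/xhtml">\n'
                '<head><title></title>{}</head>\n'
                '<body>{}</body>\n'
                '</html>')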
def makeEPUB(self):
    out_enc = find_output_encoding(self.opffile)
    print('Markup encoded as:', out_enc)
    ml2html = MobiMLConverter(self.htmlfile, out_enc)
    xhtmlstr, css, cssname = ml2html.processml()
    soup = gumbo_bs4.parse(xhtmlstr)
    xhtmlstr = soup.prettyprint_xhtml()
    file_open(self.htmlfile, 'wb').write(xhtmlstr.encode('utf-8'))
    if has_cssutils:
        sheet = cssutils.parseString(css)
        cssutils.ser.prefs.indent = 2 * ' '
        cssutils.ser.prefs.indentClosingBrace = False
        cssutils.ser.prefs.omitLastSemicolon = False
        css = unicode_str(sheet.cssText)
    file_open(cssname, 'wb').write(css.encode('utf-8'))
    with file_open(self.opffile, 'r', encoding='utf-8') as fp:
        newopf = ''
        for line in fp:
            if line.startswith('<item'):
                # str.find() returns -1 (truthy) when absent, so test membership instead
                if 'text/x-oeb1-document' in line:
                    line = line.replace('text/x-oeb1-document', 'application/xhtml+xml')
                if 'text/html' in line:
                    line = line.replace('text/html', 'application/xhtml+xml')
            newopf += line
            if line.startswith('<manifest>'):
                newopf += '<item id="css_file" media-type="text/css" href="styles.css" />\n'
    file_open(self.opffile, 'wb').write(newopf.encode('utf-8'))
    outzip = zipfile.ZipFile(self.epubname, 'w')
    # add the mimetype file uncompressed (and first, as the EPUB container spec requires)
    mimetype = 'application/epub+zip'
    fileout = os.path.join(self.outdir, 'mimetype')
    file_open(fileout, 'wb').write(mimetype.encode('utf-8'))
    nzinfo = ZipInfo('mimetype')
    nzinfo.compress_type = zipfile.ZIP_STORED  # stdlib ZipInfo takes no compress_type kwarg
    outzip.writestr(nzinfo, mimetype)
    self.zipUpDir(outzip, self.outdir, 'META-INF')
    if os.path.exists(os.path.join(self.outdir, 'Images')):
        self.removeThumbnailImage(os.path.join(self.outdir, 'Images'))
        self.zipUpDir(outzip, self.outdir, 'Images')
    outzip.write(self.htmlfile, os.path.basename(self.htmlfile), zipfile.ZIP_DEFLATED)
    outzip.write(self.opffile, os.path.basename(self.opffile), zipfile.ZIP_DEFLATED)
    outzip.write(cssname, 'styles.css', zipfile.ZIP_DEFLATED)
    if os.path.exists(os.path.join(self.outdir, 'toc.ncx')):
        outzip.write(os.path.join(self.outdir, 'toc.ncx'), 'toc.ncx', zipfile.ZIP_DEFLATED)
    outzip.close()
    return self.epubname
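# Standalone sketch of the container rule makeEPUB() enforces: the 'mimetype'
# entry must be the first member of the zip and stored uncompressed, while
# everything else may be deflated. Filename and container content are
# throwaway placeholders.
import zipfile

def write_epub_skeleton(path='demo.epub'):
    with zipfile.ZipFile(path, 'w') as z:
        info = zipfile.ZipInfo('mimetype')
        info.compress_type = zipfile.ZIP_STORED
        z.writestr(info, 'application/epub+zip')
        z.writestr('META-INF/container.xml', '<?xml version="1.0"?><container/>',
                   zipfile.ZIP_DEFLATED)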
def create_dummy_smil_file(self, t_href, t_mid, a_href, smil_mid):
    """
    This function is not currently used.
    """
    import sigil_gumbo_bs4_adapter as gumbo_bs4
    ret = None
    xhtml_data = self.bk.readfile(t_mid).encode("utf-8")
    soup = gumbo_bs4.parse(xhtml_data)
    attributes = {
        "class": re.compile(r".*\b" + self.prefs["mo_class"] + r"\b.*"),
        "id": re.compile(r".*\b" + self.prefs["id_regex"] + r"\b.*")
    }
    s_ids = [node.attrs["id"] for node in soup.find_all(attrs=attributes)]
    if len(s_ids) > 0:
        s_name = self.smil_name_from_t_href(t_href)
        s_href = os.path.join(self.SMIL_DIRECTORY, s_name)
        mid = self.bk.href_to_id(s_href)
        if mid is not None:
            print("INFO: file '%s' exists, removing it" % (s_href))
            self.bk.deletefile(mid)
        i = 1
        data = []
        data.append(self.SMIL_HEADER % (t_href))
        for s_id in s_ids:
            p_id = "%06d" % (i)
            i += 1
            data.append(self.SMIL_ROW % (p_id, t_href, s_id, "0.000", "0.000", a_href))
        data.append(self.SMIL_FOOTER)
        data = ("\n".join(data)).encode("utf-8")
        self.bk.addfile(smil_mid, s_name, data, mime="application/smil+xml", properties=None)
        print("INFO: created file '%s'" % (s_href))
        ret = s_href
    else:
        print("ERROR: no SMIL elements in file '%s'" % (t_href))
        ret = None
    print()
    return ret
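# The SMIL_HEADER / SMIL_ROW / SMIL_FOOTER class constants used above are not
# shown in this excerpt; illustrative shapes consistent with the substitutions
# made (the EXAMPLE_ prefix marks them as guesses, not the plugin's strings):
EXAMPLE_SMIL_HEADER = ('<smil xmlns="http://www.w3.org/ns/SMIL" version="3.0">'
                       '<body><seq epub:textref="%s">')
EXAMPLE_SMIL_ROW = ('<par id="p%s"><text src="%s#%s"/>'
                    '<audio clipBegin="%ss" clipEnd="%ss" src="%s"/></par>')
EXAMPLE_SMIL_FOOTER = '</seq></body></smil>'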
def add_mo_attributes(self, data):
    """
    Add MO attributes to tags in the given XHTML file, and return
    a (messages, XHTML string) tuple with the result.

    :param data: the source code
    :type  data: str
    :rtype: tuple
    """
    import sigil_gumbo_bs4_adapter as gumbo_bs4
    msgs = []
    soup = gumbo_bs4.parse(data)
    i = 1
    for node in soup.find_all():
        if node.name in self.tags:
            new_id = self.id_format % (i)
            i += 1
            if self.has_nomo_class(node):
                msgs.append(("WARN", "element '%s' with class 'nomo' => ignoring (it would be '%s')" % (node.name, new_id)))
            else:
                add = True
                if self.existing_ids_only:
                    if self.has_mo_id(node):
                        msgs.append(("INFO", "element '%s' with MO id '%s' => adding class '%s'" % (node.name, node.attrs["id"], self.mo_class)))
                    else:
                        msgs.append(("WARN", "element '%s' without MO id => not adding class '%s'" % (node.name, self.mo_class)))
                        add = False
                elif self.has_id_not_mo(node):
                    msgs.append(("WARN", "element '%s' with id '%s' => not changing (it would be '%s')" % (node.name, node.attrs["id"], new_id)))
                else:
                    msgs.append(("INFO", "element '%s' => setting id '%s'" % (node.name, new_id)))
                    node.attrs["id"] = new_id
                if add:
                    self.add_mo_class(node)
    out_data = self.output_xhtml_code(soup)
    return (msgs, out_data)
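# The id_format used above is a printf-style template configured elsewhere;
# an illustrative value and what it produces:
example_id_format = "f%06d"
print(example_id_format % 1)   # -> f000001
print(example_id_format % 42)  # -> f000042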
def processMainText(bk):
    altReadingCount = 0

    def altReadingReplace(matchobj):
        nonlocal altReadingCount
        altReadingCount += 1
        print('Correcting alternative reading: "%s" | "%s"' % (matchobj.group(1).strip(), matchobj.group(2).strip()))
        # note: 1 is displayed on top of 2
        # '\1' and '\2' are literal control-character placeholders swapped in below
        return '<span style="white-space: nowrap; position: relative;"><span style="position: absolute; font-size: .8em; top: -15px; left: 50%; white-space: nowrap; letter-spacing: normal; color: inherit; font-weight: inherit; font-style: inherit;"><span style="position: relative; left: -50%;">\1</span></span><span style="display: inline-block; color: inherit; letter-spacing: normal; font-size: 1.0em; font-weight: inherit;">\2</span></span>'.replace('\1', matchobj.group(1).strip()).replace('\2', matchobj.group(2).strip())

    def altReadingReplaceRuby(matchobj):
        nonlocal altReadingCount
        altReadingCount += 1
        print('Converting alternative reading: "%s" | "%s"' % (matchobj.group(1).strip(), matchobj.group(2).strip()))
        # note: 2 is displayed on top of 1
        return '<span style="white-space: nowrap; position: relative;"><span style="position: absolute; font-size: .8em; top: -15px; left: 50%; white-space: nowrap; letter-spacing: normal; color: inherit; font-weight: inherit; font-style: inherit;"><span style="position: relative; left: -50%;">\2</span></span><span style="display: inline-block; color: inherit; letter-spacing: normal; font-size: 1.0em; font-weight: inherit;">\1</span></span>'.replace('\1', matchobj.group(1).strip()).replace('\2', matchobj.group(2).strip())

    bookTitle = 'Untitled'
    galleryImages = []
    mainText = []
    suggestedFilenames = []
    for (textID, textHref) in bk.text_iter():
        if os.path.split(textHref)[1] in ['Cover.xhtml', 'Section0001.xhtml', 'Illustrations.xhtml']:
            # main text file is anything but these
            continue
        print('\nProcessing text file: %s' % textHref)
        suggestedFilenames.append('%s[bke_v%s_passed].epub' % (os.path.splitext(os.path.basename(textHref))[0], plugin_version))
        html = bk.readfile(textID)  # read the section into html
        if not isinstance(html, text_type):  # decode bytes as UTF-8 if needed
            html = text_type(html, 'utf-8')
        plsWriteBack = False

        # unwrap heading from <h1><span id="blabl">text</span></h1> into <h1 id="blal">text</h1>;
        # class="mw-headline" parts are all removed by the ebook converter
        html = re.sub(r'<h(\d)><span id="(.+?)">(.+?)</span></h(\d)>', r'<h\1 id="\2">\3</h\4>', html)
        soup = gumbo_bs4.parse(html)

        # remove lang="en" attribute from <html> tag (FlightCrew complains)
        for htmlTag in soup.find_all('html'):
            if htmlTag.get('lang') is not None:
                del htmlTag['lang']
                plsWriteBack = True
        if plsWriteBack:
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # move up headings if necessary
        headingLv = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'h8']
        lvToMvUp = 0
        for lv in headingLv:
            tags = soup.find_all(lv)
            if len(tags) == 0:
                lvToMvUp += 1
            else:
                break
        if lvToMvUp > 0:
            print('Moving headings up %d level(s).' % lvToMvUp)
            for i in range(lvToMvUp, len(headingLv)):
                for tag in soup.find_all(headingLv[i]):
                    tag.name = headingLv[i - lvToMvUp]
            plsWriteBack = True
        if plsWriteBack:
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # add an id to headings missing it
        idFixedCount = 0
        for headingTag in soup.find_all(headingLv):
            if not headingTag.get('id'):
                headingTag['id'] = 'id-' + str(uuid.uuid4())
                idFixedCount += 1
        if idFixedCount > 0:
            plsWriteBack = True
            print('Added ID attribute to %d heading(s).' % idFixedCount)
        if plsWriteBack:
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # convert name attribute into id in <a> tags
        tagsFixedCount = 0
        for anchorTag in soup.find_all(['a']):
            if anchorTag.has_attr('name'):
                anchorTag['id'] = anchorTag['name']
                del anchorTag['name']
                tagsFixedCount += 1
        if tagsFixedCount > 0:
            print('Converted %d `name` attribute into `id` in <a> tag(s).' % tagsFixedCount)
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # originally for correcting multiple T/N sections with identical IDs (all starting
        # from 1) in the krytykal source; now it corrects ALL duplicated and invalid IDs
        idCorrected = correctDuplicateOrInvalidID(bk, soup)
        if idCorrected > 0:
            print('Corrected %d duplicated/invalid IDs and their corresponding anchors (if any).' % idCorrected)
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # strip all formatting from headings, as BTE-GEN does
        headingStrippedCount = 0
        for lv in headingLv:
            for headingTag in soup.find_all(lv):
                if len(headingTag.find_all('img')) == 0 and (len(headingTag.find_all(True)) > 0 or headingTag.get('style')):
                    headingTag.string = headingTag.get_text()
                    del headingTag['style']
                    headingStrippedCount += 1
        if headingStrippedCount > 0:
            plsWriteBack = True
            print("Stripped formatting from %d headings to match BTE-GEN's behavior." % headingStrippedCount)
        if plsWriteBack:
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # handle the invalid usage of <i> tags in HakoMari vol 2 may 2; this is due to
        # a major error in the source page, but it can't be helped.
        # also stuff here https://baka-tsuki.org/project/index.php?title=User_talk:Dreamer2908
        # ref http://www.w3schools.com/html/html_formatting.asp
        tagsFixedCount = 0
        tag2Css = {
            'b': 'font-weight: bold;',
            'strong': 'font-weight: bold;',
            'i': 'font-style: italic;',
            'em': 'font-style: italic;',
            'big': 'font-size: large',
            'small': 'font-size: smaller',
            'mark': 'background-color: yellow; color: black;',
            's': 'text-decoration: line-through;',
            'strike': 'text-decoration: line-through;',
            'del': 'text-decoration: line-through;',
            'ins': 'text-decoration: underline;',
            'sub': 'vertical-align: sub; font-size: smaller;',
            'sup': 'vertical-align: super; font-size: smaller;',
            'u': 'text-decoration: underline;',
        }
        for iTag in soup.find_all(['b', 'strong', 'i', 'em', 'big', 'small', 'mark', 's', 'strike', 'del', 'ins', 'sub', 'sup', 'u']):
            illegalChild = iTag.find_all(['p', 'div', 'table', 'blockquote', 'pre', 'caption', 'dl', 'hr', 'section', 'ul', 'ol'] + headingLv)
            if len(illegalChild) > 0:
                tagsFixedCount += 1
                for child in iTag.children:
                    if type(child) == sigil_bs4.element.NavigableString:
                        # a lot of unwanted `<p><i> </i></p>` lines would be created
                        # if you wrapped everything without checking
                        if str(child).strip() != '':
                            wrapper = child.wrap(soup.new_tag(iTag.name))
                            wrapper.wrap(soup.new_tag('p'))
                    elif child.name == 'p':
                        for grandChild in child.children:
                            if type(grandChild) == sigil_bs4.element.Tag:
                                if grandChild.name == iTag.name:
                                    grandChild.unwrap()  # remove italic from italic text
                                else:
                                    grandChild.wrap(soup.new_tag(iTag.name))
                            else:
                                grandChild.wrap(soup.new_tag(iTag.name))
                    elif child.name not in headingLv:  # skip styling headings
                        styleAttr = child.get('style')
                        if styleAttr:
                            child['style'] = tag2Css[iTag.name] + styleAttr
                        else:
                            child['style'] = tag2Css[iTag.name]
                iTag.unwrap()
        if tagsFixedCount > 0:
            print('Fixed %d range(s) of invalid usage of text formatting tags (i/b/u/etc.)' % tagsFixedCount)
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # wrap phantom (direct descendants of body) <br>/<span>/<a>, text formatting
        # tags and text in <p> (krytykal/skythewood/imoutolicious source)
        phantomWrapped = 0
        removeMe = []
        plsWriteBack = True
        for child in soup.body.contents:
            if type(child) == sigil_bs4.element.NavigableString:
                # a lot of unwanted `<p> </p>` lines would be created if you
                # wrapped everything without checking
                if str(child).strip() != '':
                    child.wrap(soup.new_tag('p'))['class'] = 'baka_epub_phantom_elements'
                    phantomWrapped += 1
                else:
                    child.replace_with('\n')  # eliminate blank phantom texts that aren't newlines or true white space
            elif type(child) == sigil_bs4.element.Tag:
                if child.name in ['br', 'a']:
                    child.wrap(soup.new_tag('p'))['class'] = 'baka_epub_phantom_elements'
                    phantomWrapped += 1
                elif child.name in ['span', 'b', 'strong', 'i', 'em', 'big', 'small', 'mark', 's', 'strike', 'del', 'ins', 'sub', 'sup', 'u']:
                    # for these, check if they have some contents; remove if not
                    if (len(child.get_text().strip()) > 0 or len(child.find_all(True)) > 0):
                        child.wrap(soup.new_tag('p'))['class'] = 'baka_epub_phantom_elements'
                    else:
                        removeMe.append(child)
                    phantomWrapped += 1
        if phantomWrapped > 0:
            for element in removeMe:
                element.decompose()
            print('Wrapped %d phantom <br>/<span>/<a>, text formatting tags and texts in <p>.' % phantomWrapped)
        html = soup.serialize_xhtml()
        soup = gumbo_bs4.parse(html)
        plsWriteBack = False

        # handle the long-deprecated center tags
        tagsFixedCount = 0
        for centerTag in soup.find_all('center'):
            if centerTag.parent.name == 'p':
                styleAttr = centerTag.parent.get('style')
                if styleAttr:
                    centerTag.parent['style'] = 'text-align: center; ' + styleAttr
                else:
                    centerTag.parent['style'] = 'text-align: center;'
                centerTag.unwrap()
            else:
                centerTag.name = 'div'
                centerTag['style'] = 'text-align: center;'
            tagsFixedCount += 1
        if tagsFixedCount > 0:
            plsWriteBack = True
            print('Converted %d deprecated center tag(s) into a suitable form for ePub.' % tagsFixedCount)
        if plsWriteBack:
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # handle the deprecated u tags
        tagsFixedCount = 0
        for uTag in soup.find_all('u'):
            uTag.name = 'span'
            uTag['style'] = 'text-decoration: underline;'
            tagsFixedCount += 1
        if tagsFixedCount > 0:
            plsWriteBack = True
            print('Converted %d deprecated u tag(s) into a suitable form for ePub.' % tagsFixedCount)
        if plsWriteBack:
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # handle the deprecated <s> and <strike> tags; use <del> instead
        tagsFixedCount = 0
        for strikeTag in soup.find_all(['s', 'strike']):
            strikeTag.name = 'del'
            tagsFixedCount += 1
        if tagsFixedCount > 0:
            plsWriteBack = True
            print('Converted %d deprecated <s> and <strike> tag(s) into <del> tag(s).' % tagsFixedCount)
        if plsWriteBack:
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # remove all data-* attributes from tags
        tagsFixedCount = 0
        for buggyTag in soup.find_all(True):
            attrDel = 0
            for attr in list(buggyTag.attrs.keys()):
                if attr.startswith('data-'):
                    del buggyTag[attr]
                    attrDel += 1
                elif attr == 'itemprop':
                    del buggyTag[attr]
                    attrDel += 1
                elif attr == 'target':
                    del buggyTag[attr]
                    attrDel += 1
            if attrDel > 0:
                tagsFixedCount += 1
        if tagsFixedCount > 0:
            plsWriteBack = True
            print('Removed itemprop/data-*/target attribute(s) from %d tag(s).' % tagsFixedCount)
        if plsWriteBack:
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # handle the align attribute in p, div, span (and similar) tags
        tagsFixedCount = 0
        for pdivspanTag in soup.find_all(['p', 'div', 'span', 'caption', 'img', 'table'] + headingLv):
            alignAttr = pdivspanTag.get('align')
            if alignAttr is not None:
                styleAttr = pdivspanTag.get('style')
                if styleAttr:
                    pdivspanTag['style'] = 'text-align: %s; ' % alignAttr + styleAttr
                else:
                    pdivspanTag['style'] = 'text-align: %s;' % alignAttr
                del pdivspanTag['align']
                tagsFixedCount += 1
        if tagsFixedCount > 0:
            print('Converted align attribute in %d p/div/span tag(s) into css style.' % tagsFixedCount)
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # remove align/noshade/size/width attributes from <hr> tags
        tagsFixedCount = 0
        for buggyTag in soup.find_all('hr'):
            attrDel = 0
            for attr in list(buggyTag.attrs.keys()):
                if attr in ['align', 'noshade', 'size', 'width']:
                    del buggyTag[attr]
                    attrDel += 1
            if attrDel > 0:
                tagsFixedCount += 1
        if tagsFixedCount > 0:
            print('Removed all deprecated attributes from %d <hr> tag(s).' % tagsFixedCount)
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # remove all but global attributes from br tags
        # (event attributes are allowed, but there's no point in such attributes in an epub)
        globalAttributes = ['accesskey', 'class', 'contenteditable', 'contextmenu', 'dir', 'draggable', 'dropzone', 'hidden', 'id', 'lang', 'spellcheck', 'style', 'tabindex', 'title', 'translate']
        tagsFixedCount = 0
        for buggyTag in soup.find_all('br'):
            attrDel = 0
            for attr in list(buggyTag.attrs.keys()):
                if attr not in globalAttributes:
                    del buggyTag[attr]
                    attrDel += 1
            if attrDel > 0:
                tagsFixedCount += 1
        if tagsFixedCount > 0:
            print('Removed all invalid attributes from %d <br> tag(s).' % tagsFixedCount)
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # apply that certain customization to Baka-Tsuki's alternative reading style
        altReadingCustomized = 0
        for spanTag in soup.find_all('span'):
            styleAttr = spanTag.get('style')
            if (styleAttr and (styleAttr.replace(' ', '').startswith("position: absolute; font-size: .8em; top: -11px;".replace(' ', '')))):
                spanTag['style'] = styleAttr.replace('-11px', '-15px')
                altReadingCustomized += 1
        if altReadingCustomized > 0:
            plsWriteBack = True
            print("Customized Baka-Tsuki's style in %d alternative reading(s)." % altReadingCustomized)
        if plsWriteBack:
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # remove all "Status: Incomplete" messages
        # signatures:
        # + <div style="width:25%; border:10px solid white; clear:both; float:right; text-align:center;">
        # + <b>Status: Incomplete</b>
        # + <div style="clear:both; {{#ifeq: yes | yes | margin:auto; text-align:center;">
        removeMe = []
        for divTag in soup.find_all('div'):
            hasWidth25percent = False
            hasStatusIncompleteMsg = False
            hasFaultyCssStyle = False
            styleAttr = divTag.get('style')
            if styleAttr and ('width:25%;' in re.sub(r'\s', '', styleAttr)):
                hasWidth25percent = True
            bTags = divTag.find_all('b')
            subDivTags = divTag.find_all('div')
            for bTag in bTags:
                if bTag.get_text().strip() == 'Status: Incomplete':
                    hasStatusIncompleteMsg = True
                    break
            for subDivTag in subDivTags:
                styleAttr = subDivTag.get('style')
                if (styleAttr and ('{{#ifeq: yes | yes | margin:auto;' in styleAttr)):
                    hasFaultyCssStyle = True
                    break
            if hasWidth25percent and hasStatusIncompleteMsg and hasFaultyCssStyle:
                removeMe.append(divTag)
        if len(removeMe) > 0:
            plsWriteBack = True
            for garbage in removeMe:
                # print(garbage)
                garbage.decompose()
            print('Removed %d "Status: Incomplete" message(s).' % len(removeMe))
        if plsWriteBack:
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # fix the invalid css code in the "Status: Incomplete" message
        invalidCssCodeFixed = 0
        for divTag in soup.find_all('div'):
            styleAttr = divTag.get('style')
            if (styleAttr and ('{{#ifeq: yes | yes | margin:auto;' in styleAttr)):
                divTag['style'] = styleAttr.replace('{{#ifeq: yes | yes | margin:auto;', '/*! {{#ifeq: yes | yes | margin:auto; */')
                invalidCssCodeFixed += 1
        if invalidCssCodeFixed > 0:
            plsWriteBack = True
            print('Removed invalid CSS code in %d "Status: Incomplete" message(s).' % invalidCssCodeFixed)
        if plsWriteBack:
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # remove the navigator at the end. How to detect: the last table, containing
        # all baka-tsuki.org links. An automatic and simple navigator should contain
        # only a single table. A customized navigator might contain several nested
        # tables. Kill the biggest one together with everything inside.
        allTables = soup.find_all('table')
        if len(allTables) > 0:
            tableTag = allTables[-1]
            if tableTag:
                for tmpTag in tableTag.parents:  # reach the highest level of table
                    if tmpTag is not None and tmpTag.name == 'table':
                        tableTag = tmpTag
                allATag = tableTag.find_all('a')
                if len(allATag) > 0:  # a table with no link doesn't count
                    allBtLink = True
                    for aTag in allATag:
                        href = aTag.get('href')
                        if (href is not None) and ('baka-tsuki.org' not in href) and (not href.startswith('javascript:')):
                            # href can be a js link to collapse/expand
                            allBtLink = False
                else:
                    allBtLink = False
                if allBtLink:
                    print('Removed the unwanted navigator (table of links to main page and other volumes) at the end of main text.')
                    tableTag.decompose()
        if plsWriteBack:
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # search for gallery images first
        for imgTag in soup.find_all('img'):
            imgSrc = urllib.parse.unquote(imgTag.get('src'))
            imgAlt = imgTag.get('alt')
            imgName = os.path.split(imgSrc)[1]
            if imgAlt and imgAlt.startswith('__galleryimage__'):
                imgInGallery = [_[0] for _ in galleryImages]
                if imgSrc not in imgInGallery:
                    galleryImages.append((imgSrc, imgAlt))
                # still remove it from the text even if it's a duplicate
                outerTag = imgTag.parent
                imgTag.decompose()
                if len(outerTag.contents) == 0:
                    outerTag.decompose()
        if len(galleryImages) > 0:
            plsWriteBack = True
            print('Found %d gallery images: %r.' % (len(galleryImages), [_[0] for _ in galleryImages]))
            plsRemoveEverythingAboveGallery = True
            for divTag in soup.find_all('div'):
                # eliminate gallery div and everything before it
                divID = divTag.get('id')
                # note that there can be multiple galleries
                if divID is not None and divID.startswith('__gallery__'):
                    if plsRemoveEverythingAboveGallery:
                        aboveTheGallery = divTag.find_previous_siblings()
                        if (len(aboveTheGallery) < 6):
                            for tmpTag in aboveTheGallery:
                                tmpTag.decompose()
                            print('Cleaned stuff above gallery #%s.' % divID)
                        else:
                            print('Too much stuff above gallery #%s. Not gonna clean. Contents even before the gallery?' % divID)
                    divTag.decompose()
        if plsWriteBack:
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # wrap img in svg
        imgWrappedInSvg = 0
        outOfGalleryImages = []
        print('Processing images in body text...')
        for imgTag in soup.find_all('img'):
            imgSrc = urllib.parse.unquote(imgTag.get('src'))
            imgWidth = imgTag.get('width')
            imgHeight = imgTag.get('height')
            imgName = os.path.split(imgSrc)[1]
            # remove the img from the gallery if it's used in the body
            # (iterate over a copy, since we remove from the list)
            for tmp in list(galleryImages):
                tmpsrc, tmpalt = tmp
                if tmpsrc == imgSrc:
                    outOfGalleryImages.append(tmp)
                    galleryImages.remove(tmp)
            if imgTag.parent.name in headingLv:
                print('Skipped processing heading image: %s' % imgName)
                continue
            print('Processing image: %s' % imgName)
            if imgSrc.startswith('../'):
                imgSrc = imgSrc[3:]
            imgID = bk.href_to_id(imgSrc)
            if imgID:  # image file exists
                svgNode = gumbo_bs4.parse(getSvgForImage(bk, imgID, dispWidth=imgWidth, dispHeight=imgHeight))
                # Deal with an anchor wrapping around the original img tag,
                # usually <p><a href="http://somewhere.com"><img src='blabla.jpg' alt='nothing' /></a></p>:
                # copy <a> to inside <div>, outside <img> or <svg>; put svgNode outside <a> (and <p> if any).
                # If <a> contains nothing but the image, kill the original <a>.
                if imgTag.parent.name == 'a':
                    anchorTag = imgTag.parent
                    targetHref = anchorTag.get('href')
                    if targetHref:
                        newATag = soup.new_tag('a')
                        newATag['href'] = targetHref
                        for tmpTag in svgNode.find_all(['svg', 'img']):
                            tmpTag.wrap(newATag)
                    imgTag.parent.insert_before(imgTag)
                    if len(anchorTag.contents) == 0 or (len(anchorTag.contents) == 1 and str(anchorTag.contents[0]).strip() == ''):
                        anchorTag.decompose()
                # if the parent tag is p, insert svgNode before p and delete img;
                # svg is not allowed inside p or span
                if imgTag.parent.name == 'p':
                    imgTag.parent.insert_before(svgNode)
                    outerTag = imgTag.parent
                    imgTag.decompose()
                    if len(outerTag.contents) == 0:
                        outerTag.decompose()
                elif imgTag.parent.name == 'div' or imgTag.parent.name == 'body':
                    imgTag.replace_with(svgNode)
                # sometimes the img tag is wrapped inside more tags than one p, like b in Heavy Object V11C3P12;
                # climb the tree until a usable place is found: directly under <body> or <div>,
                # with <div>, <p> or <a> as siblings. Insert svgNode before it.
                # Decompose the branch if it's worthless.
                else:
                    topBranch = imgTag
                    while not (topBranch.parent.name in ['div', 'body'] or len(topBranch.find_next_siblings(['div', 'p', 'a']) + topBranch.find_previous_siblings(['div', 'p'])) > 0):
                        topBranch = topBranch.parent
                    topBranch.insert_before(svgNode)
                    outerTag = imgTag.parent
                    imgTag.decompose()
                    if len(outerTag.contents) == 0:
                        outerTag.decompose()
                imgWrappedInSvg += 1
            else:
                print('Error: image file not found.')
        if imgWrappedInSvg > 0:
            plsWriteBack = True
            print('Wrapped %d images in SVG.' % imgWrappedInSvg)
        if len(outOfGalleryImages) > 0:
            plsWriteBack = True
            print("Removed %d images from the gallery because they're used in the body text: %r" % (len(outOfGalleryImages), [_[0] for _ in outOfGalleryImages]))
        if plsWriteBack:
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # re-add attributes removed by BeautifulSoup for no reason
        errorsByBsCorrected = 0
        for svgTag in soup.find_all('svg'):
            # has_attr(), not `in`: Tag.__contains__ checks children, not attributes
            if not svgTag.has_attr('xmlns') or not svgTag.has_attr('xmlns:xlink'):
                errorsByBsCorrected += 1
                svgTag['xmlns'] = "http://www.w3.org/2000/svg"
                svgTag['xmlns:xlink'] = "http://www.w3.org/1999/xlink"
            for imageTag in svgTag.find_all('image'):
                try:
                    imageTag['xlink:href'] = imageTag['href']
                    del imageTag['href']
                except KeyError:
                    pass
        if errorsByBsCorrected > 0:
            plsWriteBack = True
            print('Corrected %d errors introduced by BeautifulSoup in svg/image tag.' % errorsByBsCorrected)
        if plsWriteBack:
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # clean up blank paragraphs next to headings and images
        blankParagraphsToClean = []
        for lv in headingLv:
            for headingTag in soup.find_all(lv):
                for paragraph in headingTag.find_next_siblings('p'):
                    if paragraph.get_text().strip() == '' and len(paragraph.find_all('img')) == 0:
                        blankParagraphsToClean.append(paragraph)
                    else:
                        break
                for paragraph in headingTag.find_previous_siblings('p'):
                    if paragraph.get_text().strip() == '' and len(paragraph.find_all('img')) == 0:
                        blankParagraphsToClean.append(paragraph)
                    else:
                        break
        for imgTag in soup.find_all('img'):
            if imgTag.parent.name == 'p':
                for paragraph in imgTag.parent.find_next_siblings('p'):
                    if paragraph.get_text().strip() == '' and len(paragraph.find_all('img')) == 0:
                        blankParagraphsToClean.append(paragraph)
                    else:
                        break
                for paragraph in imgTag.parent.find_previous_siblings('p'):
                    if paragraph.get_text().strip() == '' and len(paragraph.find_all('img')) == 0:
                        blankParagraphsToClean.append(paragraph)
                    else:
                        break
        for divTag in soup.find_all('div'):
            for paragraph in divTag.find_next_siblings('p'):
                if paragraph.get_text().strip() == '' and len(paragraph.find_all('img')) == 0:
                    blankParagraphsToClean.append(paragraph)
                else:
                    break
            for paragraph in divTag.find_previous_siblings('p'):
                if paragraph.get_text().strip() == '' and len(paragraph.find_all('img')) == 0:
                    blankParagraphsToClean.append(paragraph)
                else:
                    break
            if len(divTag.contents) == 0:
                blankParagraphsToClean.append(divTag)
        for endTag in soup.body.contents[::-1]:
            if type(endTag) == sigil_bs4.element.Tag:
                if endTag.name == 'p' and endTag.get_text().strip() == '' and len(endTag.find_all('img')) == 0:
                    blankParagraphsToClean.append(endTag)
                else:
                    break
        for startTag in soup.body.contents:
            if type(startTag) == sigil_bs4.element.Tag:
                if startTag.name == 'p' and startTag.get_text().strip() == '' and len(startTag.find_all('img')) == 0:
                    blankParagraphsToClean.append(startTag)
                else:
                    break
        if len(blankParagraphsToClean) > 0:
            blankParagraphsToClean = removeDuplicateBs4Object(blankParagraphsToClean)
            for paragraph in blankParagraphsToClean:
                paragraph.decompose()
            print('Cleaned %d blank paragraphs next to headings and images.' % len(blankParagraphsToClean))
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # remove trash in head
        for styleTag in soup.find_all(['style', 'script', 'link', 'iframe']):
            styleTag.decompose()
            plsWriteBack = True
        if plsWriteBack:
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False
            print('Removed embedded style/script/iframe garbage.')
        for metaTag in soup.head.find_all('meta'):
            if (metaTag.get('charset') is not None):
                print('Removing meta charset in head.')
                metaTag.decompose()
                plsWriteBack = True
        if plsWriteBack:
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # link stylesheets
        cssList = ['../Styles/page_styles.css', '../Styles/stylesheet.css']
        for linkTag in soup.head.find_all('link'):
            if (linkTag.get('rel') == 'stylesheet'):
                href = linkTag.get('href')
                if (href in cssList):
                    cssList.remove(href)
                    print('Stylesheet %s already linked.' % href)
        for css in cssList:
            cssLinkTag = soup.new_tag('link', href=css, rel="stylesheet", type="text/css")
            soup.head.append(cssLinkTag)
            print('Linked stylesheet %s.' % css)
            plsWriteBack = True
        if plsWriteBack:
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            plsWriteBack = False

        # Sigil's prettifying function tends to add needless spaces in the middle
        # of text at tag borders if the html has already been prettified by
        # BeautifulSoup, so it's better not to prettify it here.
        html = soup.serialize_xhtml()

        # handle alternative readings which have been stripped by the ebook convert script
        html = re.sub(r'<span>\s*?<span>\s*?<span>(.*?)</span>\s*?</span>\s*?<span>(.*?)</span>\s*?</span>', altReadingReplace, html, flags=re.DOTALL)
        if altReadingCount > 0:
            print('Corrected %d alternative readings.' % altReadingCount)
            plsWriteBack = True

        # convert ruby tags in yukkuri-literature-service into baka-tsuki-like alternative reading:
        # <ruby>Court Magician<rp>(</rp><rt>Civil Servant</rt><rp>)</rp></ruby>
        altReadingCount = 0
        html = re.sub(r'<ruby>(.*?)\s*<rp>\s*\(\s*</rp>\s*<rt>(.*?)</rt>\s*<rp>\s*\)\s*</rp>\s*</ruby>', altReadingReplaceRuby, html, flags=re.DOTALL)
        if altReadingCount > 0:
            print('Converted %d ruby furigana into Baka-Tsuki-like alternative reading(s).' % altReadingCount)
        mainText.append(html)
        if soup.title.string:
            bookTitle = soup.title.string.strip()
        print(' ')
    return mainText, galleryImages, bookTitle, suggestedFilenames
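# Self-contained demo of the nested-span pattern that altReadingReplace()
# rewrites above (simplified replacement for illustration; the real callback
# emits the styled spans shown in processMainText):
import re

sample = '<span><span><span>furigana</span></span><span>base text</span></span>'
pattern = r'<span>\s*?<span>\s*?<span>(.*?)</span>\s*?</span>\s*?<span>(.*?)</span>\s*?</span>'
print(re.sub(pattern, r'<ruby>\2<rt>\1</rt></ruby>', sample, flags=re.DOTALL))
# -> <ruby>base text<rt>furigana</rt></ruby>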
def generateToC(bk, bookTitle, BookId):
    print('Generating Table of Contents.')

    def createNavPointTag(tocSoup, navPointID, playOrder, entryLabel, entrySrc, entryLevel):
        navPointTag = tocSoup.new_tag('navPoint')
        navPointTag['id'] = navPointID
        navPointTag['playOrder'] = playOrder
        textTag = tocSoup.new_tag('text')
        textTag.string = entryLabel
        navLabelTag = tocSoup.new_tag('navLabel')
        navLabelTag.append(textTag)
        contentTag = tocSoup.new_tag('content')
        contentTag['src'] = entrySrc
        levelTag = tocSoup.new_tag('level')
        levelTag.string = entryLevel
        navPointTag.append(navLabelTag)
        navPointTag.append(contentTag)
        navPointTag.append(levelTag)
        return navPointTag

    tocXml = '''<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE ncx PUBLIC "-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta name="dtb:uid" content="%s"/>
<meta name="dtb:depth" content="2"/>
<meta name="dtb:totalPageCount" content="0"/>
<meta name="dtb:maxPageNumber" content="0"/>
</head>
<docTitle>
<text>%s</text>
</docTitle>
<navMap>
</navMap>
</ncx>''' % (BookId, bookTitle)
    tocSoup = sigil_bs4.BeautifulSoup(tocXml, 'xml')
    navMap = tocSoup.find('navMap')
    navID = 0
    headingLv = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'h8']
    headingLvN = {'h1': 1, 'h2': 2, 'h3': 3, 'h4': 4, 'h5': 5, 'h6': 6, 'h7': 7, 'h8': 8}
    lastTocEntry = None
    for textFileInfo in bk.text_iter():
        textID, textHref = textFileInfo
        html = bk.readfile(textID)  # read the section into html
        if not isinstance(html, text_type):  # decode bytes as UTF-8 if needed
            html = text_type(html, 'utf-8')
        soup = gumbo_bs4.parse(html)
        entryInThisFile = 0
        for headingTag in soup.find_all(headingLv):
            # All headings in body text files should have been given their id.
            # If one doesn't have an id, it's not in body text, so just ignore it.
            # Don't mind text files that don't have any entry; that's not an issue (Sigil's behavior).
            if headingTag.get('id'):
                entryLabel = headingTag.get_text()
                if entryInThisFile == 0:
                    # the first entry in the file should point to the beginning of the file (Sigil's behavior)
                    entrySrc = textHref
                else:
                    entrySrc = textHref + '#' + headingTag.get('id')
                entryLevel = headingTag.name
                entryLevelN = headingLvN[entryLevel]
                navID += 1
                navPointID = 'navPoint-%d' % navID
                playOrder = navID
                navPointTag = createNavPointTag(tocSoup, navPointID, playOrder, entryLabel, entrySrc, entryLevel)
                if not lastTocEntry:  # first entry
                    navMap.append(navPointTag)
                else:
                    # climb the tree until you find a nav of higher level, or reach navMap
                    parentCandidate = lastTocEntry
                    parentCandidate_levelN = headingLvN[parentCandidate.find('level').string]
                    while (parentCandidate.name != 'navMap' and (entryLevelN <= parentCandidate_levelN)):
                        parentCandidate = parentCandidate.parent
                        try:
                            parentCandidate_levelN = headingLvN[parentCandidate.find('level').string]
                        except (AttributeError, KeyError):
                            parentCandidate_levelN = 0
                    parentCandidate.append(navPointTag)
                lastTocEntry = navPointTag
                entryInThisFile += 1
    # if no heading was found, add a Start entry pointing to the first text file
    if navID == 0:
        for textFileInfo in bk.text_iter():
            textID, textHref = textFileInfo
            navID = 1
            navPointID = 'navPoint-%d' % navID
            playOrder = navID
            entryLabel = 'Start'
            entrySrc = textHref
            entryLevel = 'h1'
            navPointTag = createNavPointTag(tocSoup, navPointID, playOrder, entryLabel, entrySrc, entryLevel)
            navMap.append(navPointTag)
            break
    # remove all level tags: they're only useful for building the tree
    # and are not supposed to exist in the toc
    for levelTag in navMap.find_all('level'):
        levelTag.decompose()
    # also measure toc depth
    tocDepth = 0
    for navPointTag in navMap.find_all('navPoint'):
        thisDepth = 1
        parent = navPointTag.parent
        while parent.name != 'navMap':
            thisDepth += 1
            parent = parent.parent
        if thisDepth > tocDepth:
            tocDepth = thisDepth
    # set tocdepth
    for metaTag in tocSoup.find_all('meta'):
        if metaTag.get('name') == "dtb:depth":
            metaTag['content'] = str(tocDepth)
    bk.writefile(bk.gettocid(), tocSoup.prettify())
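# The navPoint elements appended above follow this NCX shape (illustrative
# values; the temporary <level> child is already stripped before writing):
SAMPLE_NAVPOINT = (
    '<navPoint id="navPoint-1" playOrder="1">'
    '<navLabel><text>Chapter 1</text></navLabel>'
    '<content src="Text/Section0002.xhtml"/>'
    '</navPoint>'
)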
def parse_xhtml(bk, cssparser: CSSParser, css_collector: CSSAttributes, prefs: MutableMapping) -> XHTMLAttributes:
    """
    Parse all the xhtml files in the epub and gather classes, ids
    and fragment identifiers. Also, gather css classes and ids
    from <style> elements.
    """
    a = XHTMLAttributes()
    fragid_container_attrs = prefs['fragid_container_attrs'] or a.fragid_container_attrs
    for xhtml_id, xhtml_href in bk.text_iter():
        filename = utils.href_to_basename(xhtml_href)
        try:
            soup = gumbo_bs4.parse(bk.readfile(xhtml_id))
        except Exception as E:
            raise XMLParsingError('Error in {}: {}'.format(filename, E))
        if prefs['parse_only_selected_files'] and xhtml_href not in prefs['selected_files']:
            gather_only_fragid = True
        else:
            gather_only_fragid = False
        for elem in soup.find_all(True):
            # gather fragment identifiers, if present
            for attr in fragid_container_attrs:
                fragid = get_fragid(elem, attr)
                if fragid:
                    a.fragment_identifier.add(fragid)
            if gather_only_fragid:
                continue
            # tag 'style': gather all css classes and ids
            if elem.name == 'style':
                try:
                    style = elem.contents[0]
                except IndexError:
                    pass
                else:
                    cssparser.parse_style(style, css_collector, filename)
            # gather id value, if present
            try:
                id_ = elem['id']
            except KeyError:
                pass
            else:
                if id_ in a.id_values:
                    try:
                        a.info_id_values[id_][xhtml_href] += 1
                    except KeyError:
                        a.info_id_values[id_][xhtml_href] = 1
                else:
                    a.info_id_values[id_] = {xhtml_href: 1}
                a.id_values.add(id_)
            # gather class names and the literal value of the class attribute, if present
            classes = elem.get('class', [])
            if isinstance(classes, str):
                classes = [classes]
            for class_ in classes:
                if class_ in a.class_names:
                    try:
                        a.info_class_names[class_][xhtml_href] += 1
                    except KeyError:
                        a.info_class_names[class_][xhtml_href] = 1
                else:
                    a.info_class_names[class_] = {xhtml_href: 1}
                a.class_names.add(class_)
            if classes:
                try:
                    literal_class_value = re.search(r'class=([\'"])(.+?)\1', str(elem)).group(2)
                except AttributeError:
                    pass
                else:
                    a.literal_class_values.add(literal_class_value)
    a.class_names.discard('')
    a.literal_class_values.discard('')
    return a
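# get_fragid() is used above but not defined in this excerpt. A plausible
# sketch of its behavior (hypothetical name and logic, for illustration only):
# return the fragment part of a URL-valued attribute such as href, or None.
from urllib.parse import urldefrag

def get_fragid_sketch(elem, attr):
    value = elem.get(attr)
    if not value:
        return None
    fragment = urldefrag(value).fragment
    return fragment or None

print(get_fragid_sketch({'href': 'Text/ch1.xhtml#note-3'}, 'href'))  # -> note-3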
def run(bk):
    # get python plugin path
    global plugin_path
    plugin_path = os.path.join(bk._w.plugin_dir, plugin_name)
    for (textID, textHref) in bk.text_iter():
        if os.path.split(textHref)[1] in ['Cover.xhtml', 'cover.xhtml', 'titlepage.xhtml', 'Section0001.xhtml', 'Illustrations.xhtml']:
            # main text file is anything but these
            continue
        print('\nProcessing text file: %s' % textHref)
        textContents = bk.readfile(textID)  # read the section into textContents
        if not isinstance(textContents, text_type):  # decode bytes as UTF-8 if needed
            textContents = text_type(textContents, 'utf-8')
        soup = gumbo_bs4.parse(textContents)

        def reloadSoup():
            nonlocal soup, textContents
            if soup:
                textContents = soup.serialize_xhtml()
                soup = gumbo_bs4.parse(textContents)

        def cleanUpForWordpress():
            nonlocal soup
            # cleanups for wordpress-based epubs
            # - get <h1> from <header class="entry-header">blablah</header> inside <body>
            headingLv = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
            headerNode = soup.body.find("header")
            if headerNode:
                headingTags = headerNode.find_all(headingLv)
                if len(headingTags) > 0:
                    del headingTags[0]["class"]
                    del headingTags[0]["style"]
                    headerNode.replace_with(headingTags[0])
            # clean <body> too
            del soup.body['class']
            del soup.body['style']
            # - unwrap <div class="entry-content">
            # - kill <div class="entry-meta">
            divClassUnwrapMe = ["entry-content", "entry-the-content", "post-entry"]
            divClassRemoveMe = ["entry-meta", "screen-reader-text", "sharedaddy", "wc-comment", "wc-blog-", "comments"]
            deleteMe = []
            for node in soup.body.find_all('div'):
                if node.has_attr('class'):
                    if stringContainsAny(node.get('class'), divClassUnwrapMe):
                        node.unwrap()
                    elif stringContainsAny(node.get('class'), divClassRemoveMe):
                        deleteMe.append(node)
            for node in deleteMe:
                node.decompose()
            # - delete <footer>
            for node in soup.find_all(['footer']):
                node.decompose()
            reloadSoup()

        def cleanUpForTruyenFull():
            nonlocal soup
            headingLv = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
            headingTags = soup.body.find_all(headingLv)
            divTags = soup.body.find_all("div", "chapter-c")  # WebToEpub note: you need <div class="col-xs-12">
            if len(divTags) > 0:
                textNode = divTags[0].extract()
                headerNode = None
                if len(headingTags) > 0:
                    headerNode = headingTags[0].extract()
                    del headerNode["class"]
                    del headerNode["style"]
                    headerNode.string = headerNode.get_text().strip()
                html = soup.serialize_xhtml()
                soup = gumbo_bs4.parse(html)
                for node in soup.body.contents:
                    node.extract()
                if headerNode:
                    soup.body.append(headerNode)
                soup.body.append(textNode)
                html = soup.serialize_xhtml()
                soup = gumbo_bs4.parse(html)
                for node in soup.body.find_all(lambda tag: tag.has_attr('style') and 'font-size:1px' in tag['style']):
                    node.decompose()
                html = soup.serialize_xhtml()
                soup = gumbo_bs4.parse(html)
                textNode = soup.body.find("div", "chapter-c")
                # unwrap the div in a preferable way
                newTextNode = soup.new_tag('div')
                newTextNode['class'] = "chapter-c"
                previousP = None
                for child in textNode.contents:
                    if type(child) == sigil_bs4.element.NavigableString:
                        # a lot of unwanted `<p> </p>` lines would be created if you wrapped everything without checking
                        if str(child).strip() != '':
                            if previousP:
                                previousP.append(copy.copy(child))
                            else:
                                child = copy.copy(child)
                                newTextNode.append(child)
                                previousP = child.wrap(soup.new_tag('p'))
                        else:
                            newTextNode.append(copy.copy(child))  # yes, copy even blank space
                    elif type(child) == sigil_bs4.element.Tag:
                        if child.name == 'br':
                            previousP = None
                        elif child.name not in tagsNotAllowedInP:
                            # for these, check if they have some contents; skip copying if not
                            if (len(child.get_text().strip()) > 0 or len(child.find_all(True)) > 0) or child.has_attr('id') or child.has_attr('name'):
                                if previousP:
                                    previousP.append(copy.copy(child))
                                else:
                                    child = copy.copy(child)
                                    newTextNode.append(child)
                                    previousP = child.wrap(soup.new_tag('p'))
                        else:  # stuff not allowed in <p>
                            child = copy.copy(child)
                            newTextNode.append(child)
                textNode.replace_with(newTextNode)
                html = soup.serialize_xhtml()
                soup = gumbo_bs4.parse(html)
                textNode = soup.body.find("div", "chapter-c")
                textNode.unwrap()

        def splitNodesIntoP(pNodes):
            nonlocal soup
            # level 1:
            # try to split <p>line 1<br/><br/>line 2<img alt='' src='image.jpg'/>line3</p>
            # into <p>line 1</p> <p>line 2</p> <p><img alt='' src='image.jpg'/></p> <p>line3</p>
            # Remember to copy style and class. ID goes to the first p.
            # level 2:
            # into <p>line 1</p> <p></p> <p>line 2</p> <p><img alt='' src='image.jpg'/></p> <p>line3</p>
            # level x:
            # try to handle <br/> nested inside something else, like <p>line 1 <i>italic text<br/>line 2 in italic</i></p>
            # into <p>line 1 <i>italic text</i></p> <p><i>line 2 in italic</i></p>
            # currently at level 2, but empty lines are removed at a later stage, so it doesn't even matter
            # TODO: copy style, class, id
            unwrapUsID = []
            for textNode in pNodes:
                # for now, we put all new p in a container (and unwrap it later)
                newTextNode = soup.new_tag('div')
                newTextNode_id = 'id-' + str(uuid.uuid4())
                newTextNode['id'] = newTextNode_id
                unwrapUsID.append(newTextNode_id)
                previousP = None
                lastChildWasBr = False
                for child in textNode.contents:
                    if type(child) == sigil_bs4.element.NavigableString:
                        lastChildWasBr = False
                        # a lot of unwanted `<p> </p>` lines would be created if you wrapped everything without checking
                        if str(child).strip() != '':
                            if previousP:
                                previousP.append(copy.copy(child))
                            else:
                                child = copy.copy(child)
                                newTextNode.append(child)
                                previousP = child.wrap(soup.new_tag('p'))
                        else:
                            newTextNode.append(copy.copy(child))  # yes, copy even blank space
                    elif type(child) == sigil_bs4.element.Tag:
                        if child.name == 'br':
                            if lastChildWasBr:
                                newTextNode.append(soup.new_tag('p'))
                            lastChildWasBr = True
                            previousP = None
                        elif child.name == 'img':
                            child = copy.copy(child)
                            newTextNode.append(child)
                            tmpNode = child.wrap(soup.new_tag('div'))
                            tmpNode['class'] = "svg_outer svg_inner"
                            lastChildWasBr = False
                            previousP = None
                        elif child.name not in tagsNotAllowedInP:
                            lastChildWasBr = False
                            # for these, check if they have some contents; skip copying if not
                            if (len(child.get_text().strip()) > 0 or len(child.find_all(True)) > 0) or child.has_attr('id') or child.has_attr('name'):
                                if previousP:
                                    previousP.append(copy.copy(child))
                                else:
                                    child = copy.copy(child)
                                    newTextNode.append(child)
                                    previousP = child.wrap(soup.new_tag('p'))
                        else:
                            lastChildWasBr = False
                            # stuff not allowed in <p>
                            child = copy.copy(child)
                            newTextNode.append(child)
                textNode.replace_with(newTextNode)
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)
            for node in soup.body.find_all('div'):
                if node.get('id') in unwrapUsID:
                    node.unwrap()
            html = soup.serialize_xhtml()
            soup = gumbo_bs4.parse(html)

        def splitTagtoP(wantedTag):
            nonlocal soup
            pNodes = soup.body.find_all(wantedTag)
            splitNodesIntoP(pNodes)

        def splitPtoP():
            nonlocal soup
            splitTagtoP("p")

        def easyClean1():
            nonlocal soup
            plsWriteBack = False
            # delete all these nodes
            for node in soup.find_all(['style', 'meta', 'input', 'button']):
                node.decompose()
                plsWriteBack = True
            # unwrap all these nodes
            for node in soup.find_all(['font']):
                node.unwrap()
                plsWriteBack = True
            # convert name attribute into id in <a> tags
            tagsFixedCount = 0
            for anchorTag in soup.find_all(['a']):
                if anchorTag.has_attr('name'):
                    anchorTag['id'] = anchorTag['name']
                    del anchorTag['name']
                    tagsFixedCount += 1
            if tagsFixedCount > 0:
                print('Converted %d `name` attribute into `id` in <a> tag(s).' % tagsFixedCount)
                plsWriteBack = True
            # remove lang, link, vlink attributes and mso or calibre classes
            for node in soup.find_all(True):
                del node['lang']
                del node['link']
                del node['vlink']
                class_attr = node.get('class')
                if class_attr:
                    try:
                        classes = class_attr.split(' ')
                    except AttributeError:  # already a list of class names
                        classes = class_attr
                    new_classes = []
                    for cl in classes:
                        if not (cl.startswith('Mso') or cl.startswith('mso') or cl.startswith('calibre')):
                            new_classes.append(cl)
                    if len(new_classes) > 0:
                        node['class'] = ' '.join(new_classes)
                    else:
                        del node['class']
                    plsWriteBack = True
            if plsWriteBack:
                reloadSoup()

        def easyClean2():
            nonlocal soup
            plsWriteBack = False
            # remove all data-* attributes from tags
            tagsFixedCount = 0
            for buggyTag in soup.find_all(True):
                attrDel = 0
                for attr in list(buggyTag.attrs.keys()):
                    if attr.startswith('data-'):
                        del buggyTag[attr]
                        attrDel += 1
                    elif attr == 'itemprop':
                        del buggyTag[attr]
                        attrDel += 1
                    elif attr == 'target':
                        del buggyTag[attr]
                        attrDel += 1
                if attrDel > 0:
                    tagsFixedCount += 1
            if tagsFixedCount > 0:
                reloadSoup()
                print('Removed itemprop/data-*/target attribute(s) from %d tag(s).' % tagsFixedCount)
            # remove align/noshade/size/width attributes from <hr> tags
            tagsFixedCount = 0
            for buggyTag in soup.find_all('hr'):
                attrDel = 0
                for attr in list(buggyTag.attrs.keys()):
                    if attr in ['align', 'noshade', 'size', 'width']:
                        del buggyTag[attr]
                        attrDel += 1
                if attrDel > 0:
                    tagsFixedCount += 1
            if tagsFixedCount > 0:
                reloadSoup()
                print('Removed all deprecated attributes from %d <hr> tag(s).' % tagsFixedCount)
            # handle the align attribute in p, div, span
            tagsFixedCount = 0
            for pdivspanTag in soup.find_all(True):
                alignAttr = pdivspanTag.get('align')
                if alignAttr is not None:
                    styleAttr = pdivspanTag.get('style')
                    if styleAttr:
                        pdivspanTag['style'] = 'text-align: %s; ' % alignAttr + styleAttr
                    else:
                        pdivspanTag['style'] = 'text-align: %s;' % alignAttr
                    del pdivspanTag['align']
                    tagsFixedCount += 1
            if tagsFixedCount > 0:
                reloadSoup()
                print('Converted align attribute in %d p/div/span tag(s) into css style.' % tagsFixedCount)
            # remove all links except for stylesheet ones
            for node in soup.find_all(['link', 'meta']):
                if not node.get('rel') == "stylesheet":
                    node.decompose()
                    plsWriteBack = True
            # Ziru's Musings ads or placeholders for ads
            for node in soup.find_all('div', {'class': lambda x: x and ('ezoic-adpicker-ad' in x.split())}):
                node.decompose()
                plsWriteBack = True
            if plsWriteBack:
                reloadSoup()

        def removeAllStyleAttr():
            nonlocal soup
            # hard-core cleanup: strip all style attributes
            # generally should not be used
            for node in soup.find_all(True):
                del node['style']
            reloadSoup()

        def removeEmptyStyleAttr():
            nonlocal soup
            plsWriteBack = False
            for node in soup.find_all(True):
                if node.has_attr('style'):
                    styleAttr = node['style'].strip()
                    if styleAttr:
                        node['style'] = styleAttr
                    else:
                        del node['style']
                    plsWriteBack = True
            if plsWriteBack:
                reloadSoup()

        def stripHeaderFormattings():
            nonlocal soup
            # strip all formatting from headings, as BTE-GEN does
            headingLv = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
            headingStrippedCount = 0
            for lv in headingLv:
                for headingTag in soup.find_all(lv):
                    if len(headingTag.find_all('img')) == 0 and (len(headingTag.find_all(True)) > 0 or headingTag.get('style')):
                        headingTag.string = headingTag.get_text().strip()
                        del headingTag['style']
                        headingStrippedCount += 1
            if headingStrippedCount > 0:
                reloadSoup()
                print("Stripped formatting from %d headings to match BTE-GEN's behavior." % headingStrippedCount)

        def removedNoDisplayDiv():
            nonlocal soup
            # remove all <div style="display:none;">
            modifiedTagCount = 0
            removeMe = []
            for divTag in soup.find_all('div'):
                if divTag and divTag.get("style") and 'display:none' in re.sub(r"\s", "", divTag.get("style")):
                    removeMe.append(divTag)
                    modifiedTagCount += 1
            if modifiedTagCount > 0:
                for divTag in removeMe:
                    divTag.decompose()
                print('Removed %d <div style="display:none;"> tags.' % modifiedTagCount)
                reloadSoup()

        def fixBadIBUusage():
            nonlocal soup
            # handle the invalid usage of <i> tags in HakoMari vol 2 may 2; this is due
            # to a major error in the source page, but it can't be helped.
            # also stuff here https://baka-tsuki.org/project/index.php?title=User_talk:Dreamer2908
            # ref http://www.w3schools.com/html/html_formatting.asp
            headingLv = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
            tagsFixedCount = 0
            tag2Css = {
                'b': 'font-weight: bold;',
                'strong': 'font-weight: bold;',
                'i': 'font-style: italic;',
                'em': 'font-style: italic;',
                'big': 'font-size: large',
                'small': 'font-size: smaller',
                'mark': 'background-color: yellow; color: black;',
                's': 'text-decoration: line-through;',
                'strike': 'text-decoration: line-through;',
                'del': 'text-decoration: line-through;',
                'ins': 'text-decoration: underline;',
                'sub': 'vertical-align: sub; font-size: smaller;',
                'sup': 'vertical-align: super; font-size: smaller;',
                'u': 'text-decoration: underline;',
            }
            for iTag in soup.find_all(['b', 'strong', 'i', 'em', 'big', 'small', 'mark', 's', 'strike', 'del', 'ins', 'sub', 'sup', 'u']):
                illegalChild = iTag.find_all(['p', 'div', 'table', 'blockquote', 'pre', 'caption', 'dl', 'hr', 'section', 'ul', 'ol'] + headingLv)
                if len(illegalChild) > 0:
                    tagsFixedCount += 1
                    for child in iTag.children:
                        if type(child) == sigil_bs4.element.NavigableString:
                            # a lot of unwanted `<p><i> </i></p>` lines would be created if you wrapped everything without checking
                            if str(child).strip() != '':
                                wrapper = child.wrap(soup.new_tag(iTag.name))
                                wrapper.wrap(soup.new_tag('p'))
                        elif child.name == 'p':
                            for grandChild in child.children:
                                if type(grandChild) == sigil_bs4.element.Tag:
                                    if grandChild.name == iTag.name:
                                        grandChild.unwrap()  # remove italic from italic text
                                    else:
                                        grandChild.wrap(soup.new_tag(iTag.name))
                                else:
                                    grandChild.wrap(soup.new_tag(iTag.name))
                        elif child.name not in headingLv:  # skip styling headings
                            styleAttr = child.get('style')
                            if styleAttr:
                                child['style'] = tag2Css[iTag.name] + styleAttr
                            else:
                                child['style'] = tag2Css[iTag.name]
                    iTag.unwrap()
            if tagsFixedCount > 0:
                reloadSoup()
                print('Fixed %d range(s) of invalid usage of text formatting tags (i/b/u/etc.)' % tagsFixedCount)

        def convertPossibleDivToP():
            nonlocal soup
            # convert div into p if possible
            modifiedTagCount = 0
            for divTag in soup.find_all('div'):
                if canBeConvertedIntoP(divTag):
                    divTag.name = 'p'
                    modifiedTagCount += 1
                # elif not (divTag.get('style') or divTag.get('id') or divTag.get('class')):
                #     divTag.unwrap()
            if modifiedTagCount > 0:
                reloadSoup()
                print('Converted %d div tags into p.' % modifiedTagCount)

        def unwarpSingleBigDiv():
            nonlocal soup
            # unwrap the big single div holding all contents
            bigDivCount = 0
            for node in soup.body.contents:
                if (type(node) == sigil_bs4.element.Tag):
                    if (node.name == 'div'):
                        bigDivCount += 1
                    else:
                        bigDivCount += 1000
            if bigDivCount == 1:
                soup.body.div.unwrap()
                reloadSoup()
                print('Unwrapped the big single div holding all contents.')

        def unwarpPossibleDiv_basic():
            nonlocal soup
            modifiedTagCount = 0
            for divTag in soup.find_all('div'):
                if canBeUnwrap(divTag):
                    divTag.unwrap()
                    modifiedTagCount += 1
            if modifiedTagCount > 0:
                reloadSoup()
                print('Unwrapped %d div tags.' % modifiedTagCount)

        def unwarpPossibleDiv_experimental():
            nonlocal soup
            modifiedTagCount = 0
            pNodes = []
            for divTag in soup.find_all('div'):
                if canBeUnwrap(divTag):
                    pNodes.append(divTag)
                    modifiedTagCount += 1
            splitNodesIntoP(pNodes)
            if modifiedTagCount > 0:
                reloadSoup()
                print('Unwrapped %d div tags.' % modifiedTagCount)

        # remove empty span
        # do this before wrapping stray tags
        def removeEmptySpan():
            nonlocal soup

            def removeEmptySpanSub(spanTag):
                if spanTag.parent is None:
                    return False
                modified = False
                if containChildTags(spanTag, ['span']):
                    for subSpanTag in spanTag.find_all(['span']):
                        changed = removeEmptySpanSub(subSpanTag)
                        if changed:
                            modified = True
                if spanTag.get_text().strip() == '' and not containChildTags(spanTag, ['span', 'img']):
                    # if it still has some span, don't decompose it
                    spanTag.unwrap()
                    modified = True
                elif not (spanTag.get('style') or (spanTag.get('id') and spanTag.get('id').startswith('_Toc'))):
                    spanTag.unwrap()
                    modified = True
                elif spanTag.get('style') and (spanTag.get('style').strip() == "font-weight: 400;" or spanTag.get('style').strip() == ""):
                    spanTag.unwrap()
                    modified = True
                return modified

            plsWriteBack = False
            for spanTag in soup.find_all(['span']):
                modified = removeEmptySpanSub(spanTag)
                if modified:
                    plsWriteBack = True
            if plsWriteBack:
                reloadSoup()

        def wrapStrayText_basic():
            nonlocal soup
            # wrap stray (direct descendants of body) <br>/<span>/<a>, text formatting
            # tags and text in <p> (krytykal/skythewood/imoutolicious source)
            phantomWrapped = 0
            removeMe = []
            for child in soup.body.contents:
                if type(child) == sigil_bs4.element.NavigableString:
                    # a lot of unwanted `<p> </p>` lines would be created if you wrapped everything without checking
                    if str(child).strip() != '':
                        child.wrap(soup.new_tag('p'))['class'] = 'baka_epub_stray_elements'
                        phantomWrapped += 1
                    else:
                        child.replace_with('\n')  # eliminate blank stray texts that aren't newlines or true white space
                elif type(child) == sigil_bs4.element.Tag:
                    if child.name in ['br', 'a']:
                        child.wrap(soup.new_tag('p'))['class'] = 'baka_epub_stray_elements'
                        phantomWrapped += 1
                    elif child.name in ['span', 'b', 'strong', 'i', 'em', 'big', 'small', 'mark', 's', 'strike', 'del', 'ins', 'sub', 'sup', 'u']:
                        # for these, check if they have some contents; remove if not
                        if (len(child.get_text().strip()) > 0 or len(child.find_all(True)) > 0):
                            child.wrap(soup.new_tag('p'))['class'] = 'baka_epub_stray_elements'
                        else:
                            removeMe.append(child)
                        phantomWrapped += 1
            if phantomWrapped > 0:
                for element in removeMe:
                    element.decompose()
                reloadSoup()
                print('Wrapped %d stray <br>/<span>/<a>, text formatting tags and texts in <p>.' % phantomWrapped)

        def wrapStrayText_experimental():
            nonlocal soup
            splitNodesIntoP((soup.body, ))

        def removeEmptyP():
            nonlocal soup
            plsWriteBack = False
            for spanTag in soup.find_all(['p']):
                # remove empty p
                if spanTag.get_text().strip() == '' and len(spanTag.find_all(['img'])) == 0:
                    spanTag.decompose()
                    plsWriteBack = True
            if plsWriteBack:
                reloadSoup()

        cleanUpForWordpress()
        cleanUpForTruyenFull()
        unwarpSingleBigDiv()
        easyClean1()
        easyClean2()
        removeEmptyStyleAttr()
        stripHeaderFormattings()
        fixBadIBUusage()
        removedNoDisplayDiv()
        convertPossibleDivToP()
        unwarpPossibleDiv_experimental()
        removeEmptySpan()
        wrapStrayText_experimental()
        textContents = soup.serialize_xhtml()
        # strip all comments
        textContents = re.sub('<!--(.*?)-->', '', textContents, flags=re.DOTALL)
        bk.writefile(textID, textContents)
    print('Done.')
    return 0
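# canBeConvertedIntoP() and canBeUnwrap() are called above but defined
# elsewhere in the plugin. Plausible sketches of their intent (guesses, not
# the plugin's actual logic): a div can become a <p> when it holds no
# block-level children, and can be unwrapped when it carries no styling hooks.
BLOCK_LEVEL_TAGS = ['p', 'div', 'table', 'blockquote', 'pre', 'ul', 'ol',
                    'dl', 'hr', 'section', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']

def canBeConvertedIntoP_sketch(divTag):
    return len(divTag.find_all(BLOCK_LEVEL_TAGS)) == 0

def canBeUnwrap_sketch(divTag):
    return not (divTag.get('style') or divTag.get('id') or divTag.get('class'))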