def extract_content(self, output_dir):
    '''
    Decompress every text record, convert it to HTML and write the result,
    together with an OPF file, into output_dir.

    Each line of a record becomes a <p> paragraph. The first line of a
    record that contains the record's chapter title is marked up as the
    chapter heading; when no line contains the title, a heading built from
    the title is put in front of the record.

    @output_dir: Directory that receives index.html and metadata.opf.
    @return: Path of the written metadata.opf.
    '''
    sections = []

    self.log.info(u'Decompressing text...')
    for i in range(1, self.header_record.num_records + 1):
        self.log.debug(u'\tDecompressing text section %i' % i)
        # Records are numbered from 1, the title list is indexed from 0.
        title = self.header_record.chapter_titles[i - 1]
        lines = []
        title_added = False
        for line in self.decompress_text(i).splitlines():
            line = fix_punct(line).strip()
            # Look for the title in the raw text, before escaping changes it.
            is_heading = not title_added and title in line
            # Escape every line, headings included, so that markup
            # characters in the book text cannot corrupt the document.
            line = prepare_string_for_xml(line)
            if is_heading:
                line = '<h1 class="chapter">' + line + '</h1>\n'
                title_added = True
            lines.append('<p>%s</p>' % line)
        if not title_added:
            lines.insert(0, '<h1 class="chapter">' +
                         prepare_string_for_xml(title) + '</h1>\n')
        sections.append('\n'.join(lines))
    # Sections are concatenated without a separator.
    txt = ''.join(sections)

    self.log.info(u'Converting text to OEB...')
    html = HTML_TEMPLATE % (self.header_record.title, txt)
    with open(os.path.join(output_dir, 'index.html'), 'wb') as index:
        index.write(html.encode('utf-8'))

    mi = self.get_metadata()
    manifest = [('index.html', None)]
    spine = ['index.html']
    opf_writer(output_dir, 'metadata.opf', manifest, spine, mi)

    return os.path.join(output_dir, 'metadata.opf')
def dump_text(self, elem, stylizer, page):
    '''
    Serialise elem and everything below it to a list of HTML fragments.

    The style attribute is dropped from every element (this modifies the
    tree that is passed in) and <body> is written as <div>.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element. Only passed
    on to the recursive calls here.
    @page: Passed on to the recursive calls.
    @return: List of strings that make up the markup when joined.
    '''

    # We can only process tags. If there isn't a tag return any text.
    if not isinstance(elem.tag, (str, bytes)) \
            or parse_utils.namespace(elem.tag) not in (const.XHTML_NS,
                                                       const.SVG_NS):
        p = elem.getparent()
        if p is not None and isinstance(p.tag, (str, bytes)) \
                and parse_utils.namespace(p.tag) in (const.XHTML_NS,
                                                     const.SVG_NS) \
                and elem.tail:
            # The tail is plain text from the tree, so it has to be escaped
            # like any other text before it is written as markup.
            return [self.prepare_string_for_html(elem.tail)]
        return ['']

    # Setup our variables.
    text = ['']
    tags = []
    tag = parse_utils.barename(elem.tag)
    attribs = elem.attrib

    if tag == 'body':
        tag = 'div'
    tags.append(tag)

    # Remove attributes we won't want.
    if 'style' in attribs:
        del attribs['style']

    # Turn the rest of the attributes into a string we can write with the
    # tag.
    at = ''.join(' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
                 for k, v in attribs.items())

    # Write the tag.
    text.append('<%s%s' % (tag, at))
    if tag in SELF_CLOSING_TAGS:
        text.append(' />')
    else:
        text.append('>')

    # Process tags that contain text.
    if hasattr(elem, 'text') and elem.text:
        text.append(self.prepare_string_for_html(elem.text))

    # Recurse down into tags within the tag we are in.
    for item in elem:
        text += self.dump_text(item, stylizer, page)

    # Close all open tags.
    tags.reverse()
    for t in tags:
        if t not in SELF_CLOSING_TAGS:
            text.append('</%s>' % t)

    # Add the text that is outside of the tag.
    if hasattr(elem, 'tail') and elem.tail:
        text.append(self.prepare_string_for_html(elem.tail))

    return text
def prepare_string_for_html(self, raw):
    '''
    Escape raw for use as HTML text and spell out the characters that are
    easy to lose or impossible to see as named HTML entities.

    @raw: Plain text taken from the document tree.
    @return: The text, passed through prepare_string_for_xml and with soft
    hyphen, em dash, en dash and no-break space replaced by &shy;,
    &mdash;, &ndash; and &nbsp;.
    '''
    raw = prepare_string_for_xml(raw)
    # The entities are inserted after the XML escaping on purpose:
    # otherwise their leading '&' would be escaped as well.
    for char, entity in ((u'\u00ad', '&shy;'),
                         (u'\u2014', '&mdash;'),
                         (u'\u2013', '&ndash;'),
                         (u'\u00a0', '&nbsp;')):
        raw = raw.replace(char, entity)
    return raw
def mlize_spine(self, oeb_book):
    '''
    Render all spine items of oeb_book into a single HTML document.

    For every spine item the ids and links are rewritten first, then its
    <body> is serialised with dump_text. Items are separated by a blank
    line.

    @oeb_book: The book whose spine is converted.
    @return: The complete HTML document as one string.
    '''
    head = (u'<html><head><meta http-equiv="Content-Type" '
            u'content="text/html;charset=utf-8" />'
            u'<title>%s</title></head><body>')
    pieces = [head % (prepare_string_for_xml(self.book_title))]

    for page in oeb_book.spine:
        self.log.debug('Converting %s to HTML...' % page.href)
        self.rewrite_ids(page.data, page)
        link_rewriter = partial(self.rewrite_link, page=page)
        base.rewrite_links(page.data, link_rewriter)
        styles = Stylizer(page.data, page.href, oeb_book, self.opts)
        body = page.data.find(base.tag('xhtml', 'body'))
        pieces.extend(self.dump_text(body, styles, page))
        pieces.append('\n\n')

    pieces.append('</body></html>')
    return ''.join(pieces)
def mlize_spine(self, oeb_book):
    '''
    Render all spine items of oeb_book into a single HTML document that
    carries its CSS either as a link to style.css or as an inline <style>.

    @oeb_book: The book whose spine is converted.
    @return: The complete HTML document as one string.
    '''
    body_parts = []
    for page in oeb_book.spine:
        self.log.debug('Converting %s to HTML...' % page.href)
        self.rewrite_ids(page.data, page)
        link_rewriter = partial(self.rewrite_link, page=page)
        base.rewrite_links(page.data, link_rewriter)
        styles = Stylizer(page.data, page.href, oeb_book, self.opts)
        body = page.data.find(base.tag('xhtml', 'body'))
        body_parts.extend(self.dump_text(body, styles, page))
        body_parts.append('\n\n')

    # The style sheet is requested only after every page has been dumped,
    # the same order the statements have always had.
    if self.opts.htmlz_class_style == 'external':
        css = u'<link href="style.css" rel="stylesheet" type="text/css" />'
    else:
        css = u''.join((u'<style type="text/css">',
                        self.get_css(oeb_book),
                        u'</style>'))

    title = u'<title>%s</title>' % prepare_string_for_xml(self.book_title)

    document = [
        u'<html><head><meta http-equiv="Content-Type" '
        u'content="text/html;charset=utf-8" />',
        css,
        title,
        u'</head><body>',
    ]
    document.extend(body_parts)
    document.append(u'</body></html>')
    return ''.join(document)
def convert_basic(txt, title='', epub_split_size_kb=0):
    '''
    Converts plain text to html by putting all paragraphs in <p> tags.
    It condenses and retains blank lines when necessary.

    Requires paragraphs to be in single line format.

    @txt: The plain text. It is passed through clean_txt and split_txt
    before being converted.
    @title: Inserted into HTML_TEMPLATE as is.
    @epub_split_size_kb: Passed on to split_txt.
    @return: HTML_TEMPLATE filled with the title and the paragraphs.
    '''
    txt = clean_txt(txt)
    txt = split_txt(txt, epub_split_size_kb)

    lines = []
    blank_count = 0
    # Split into paragraphs based on having a blank line between text.
    for line in txt.split('\n'):
        if line.strip():
            blank_count = 0
            # The line cannot contain '\n' any more after the split above.
            lines.append(u'<p>%s</p>' % prepare_string_for_xml(line))
        else:
            blank_count += 1
            # A single blank line only separates paragraphs. The second
            # one in a row is kept as an empty paragraph; further ones are
            # condensed into it.
            if blank_count == 2:
                # Explicit no-break space: a paragraph that only holds
                # ordinary white space collapses to nothing when rendered.
                lines.append(u'<p>\u00a0</p>')

    return HTML_TEMPLATE % (title, u'\n'.join(lines))
def __str__(self):
    '''
    Serialise self.content to markup.

    Strings are escaped and stripped of NUL characters, None closes the
    most recently opened container, and any other item is written with
    str() and, unless it is self closing, remembered as an open container.
    A single container left open at the end is closed; more than one
    raises LRFParseError.
    '''
    pieces = []
    pending = collections.deque()

    for item in self.content:
        if isinstance(item, str):
            pieces.append(prepare_string_for_xml(item).replace('\0', ''))
            continue
        if item is None:
            # A close marker without an open container is ignored.
            if pending:
                pieces.append('</%s>' % (pending.pop().name, ))
            continue
        pieces.append(str(item))
        if not item.self_closing:
            pending.append(item)

    if len(pending) > 1:
        unclosed = [tag.name for tag in pending
                    if isinstance(tag, Text.TextTag)]
        raise LRFParseError('Malformed text stream %s' % (unclosed, ))
    if pending:
        pieces.append('</%s>' % (pending[0].name, ))

    return ''.join(pieces)
def dump_text(self, elem_tree, stylizer, page, tag_stack=None):
    """
    This function is intended to be used in a recursive manner. dump_text
    will run through all elements in the elem_tree and call itself on each
    element.

    self.image_hrefs will be populated by calling this function.

    @param elem_tree: etree representation of XHTML content to be
    transformed.
    @param stylizer: Used to track the style of elements within the tree.
    @param page: OEB page used to determine absolute urls.
    @param tag_stack: List of open FB2 tags to take into account. Defaults
    to no open tags.
    @return: List of string representing the XHTML converted to FB2
    markup.
    """
    # A fresh list per call instead of one shared default object.
    if tag_stack is None:
        tag_stack = []

    elem = elem_tree

    # Ensure what we are converting is not a string and that the first tag
    # is part of the XHTML namespace.
    if (not isinstance(elem_tree.tag, (str, bytes)) or
            parse_utils.namespace(elem_tree.tag) != const.XHTML_NS):
        p = elem.getparent()
        if (p is not None and isinstance(p.tag, (str, bytes)) and
                parse_utils.namespace(p.tag) == const.XHTML_NS and
                elem.tail):
            # The result is XML, so the text has to be escaped.
            return [prepare_string_for_xml(elem.tail)]
        return []

    style = stylizer.style(elem_tree)
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
        # The element itself is dropped, the text after it is kept.
        if hasattr(elem, 'tail') and elem.tail:
            return [prepare_string_for_xml(elem.tail)]
        return []

    # FB2 generated output.
    fb2_out = []
    # FB2 tags in the order they are opened. This will be used to close
    # the tags.
    tags = []
    # First tag in tree
    tag = parse_utils.barename(elem_tree.tag)
    # Number of blank lines above tag
    try:
        ems = int(round((float(style.marginTop) / style.fontSize) - 1))
        if ems < 0:
            ems = 0
    except Exception:
        ems = 0

    # Convert TOC entries to <title>s and add <section>s
    if self.opts.sectionize == 'toc':
        # A section cannot be a child of any other element than another
        # section, so leave the tag alone if there are parents
        if not tag_stack:
            # There are two reasons to start a new section here: the TOC
            # pointed to this page (then we use the first non-<body> on
            # the page as a <title>), or the TOC pointed to a specific
            # element
            newlevel = 0
            toc_entry = self.toc.get(page.href, None)
            if toc_entry is not None:
                if None in toc_entry:
                    if (tag != 'body' and hasattr(elem_tree, 'text') and
                            elem_tree.text):
                        newlevel = 1
                        self.toc[page.href] = None
                if (not newlevel and
                        elem_tree.attrib.get('id', None) is not None):
                    newlevel = toc_entry.get(
                        elem_tree.attrib.get('id', None), None)

            # Start a new section if necessary
            if newlevel:
                while newlevel <= self.section_level:
                    fb2_out.append('</section>')
                    self.section_level -= 1
                fb2_out.append('<section>')
                self.section_level += 1
                fb2_out.append('<title>')
                tags.append('title')
        if self.section_level == 0:
            # If none of the prior processing made a section, make one now
            # to be FB2 spec compliant
            fb2_out.append('<section>')
            self.section_level += 1

    # Process the XHTML tag and styles. Converted to an FB2 tag.
    # Use individual if statement not if else. There can be only one XHTML
    # tag but it can have multiple styles.
    if tag == 'img' and elem_tree.attrib.get('src', None):
        # Only write the image tag if it is in the manifest.
        ihref = base.urlnormalize(page.abshref(elem_tree.attrib['src']))
        if ihref in self.oeb_book.manifest.hrefs:
            if ihref not in self.image_hrefs:
                self.image_hrefs[ihref] = 'img_%s' % len(self.image_hrefs)
            p_txt, p_tag = self.ensure_p()
            fb2_out += p_txt
            tags += p_tag
            fb2_out.append('<image l:href="#%s"/>' %
                           self.image_hrefs[ihref])
        else:
            self.log.warn(u'Ignoring image not in manifest: %s' % ihref)
    if tag in ('br', 'hr') or ems >= 1:
        if ems < 1:
            multiplier = 1
        else:
            multiplier = ems
        if self.in_p:
            # Close the open tags up to and including the paragraph, write
            # the empty lines and open the same tags again.
            closed_tags = []
            open_tags = tag_stack + tags
            open_tags.reverse()
            for t in open_tags:
                fb2_out.append('</%s>' % t)
                closed_tags.append(t)
                if t == 'p':
                    break
            fb2_out.append('<empty-line/>' * multiplier)
            closed_tags.reverse()
            for t in closed_tags:
                fb2_out.append('<%s>' % t)
        else:
            fb2_out.append('<empty-line/>' * multiplier)
    if tag in ('div', 'li', 'p'):
        p_text, added_p = self.close_open_p(tag_stack + tags)
        fb2_out += p_text
        if added_p:
            tags.append('p')
    if tag == 'a' and elem_tree.attrib.get('href', None):
        # Handle only external links for now
        if urllib.parse.urlparse(elem_tree.attrib['href']).netloc:
            p_txt, p_tag = self.ensure_p()
            fb2_out += p_txt
            tags += p_tag
            fb2_out.append('<a l:href="%s">' %
                           base.urlnormalize(elem_tree.attrib['href']))
            tags.append('a')
    if tag == 'b' or style['font-weight'] in ('bold', 'bolder'):
        s_out, s_tags = self.handle_simple_tag('strong', tag_stack + tags)
        fb2_out += s_out
        tags += s_tags
    if tag == 'i' or style['font-style'] == 'italic':
        s_out, s_tags = self.handle_simple_tag('emphasis',
                                               tag_stack + tags)
        fb2_out += s_out
        tags += s_tags
    if (tag in ('del', 'strike') or
            style['text-decoration'] == 'line-through'):
        s_out, s_tags = self.handle_simple_tag('strikethrough',
                                               tag_stack + tags)
        fb2_out += s_out
        tags += s_tags
    if tag == 'sub':
        s_out, s_tags = self.handle_simple_tag('sub', tag_stack + tags)
        fb2_out += s_out
        tags += s_tags
    if tag == 'sup':
        s_out, s_tags = self.handle_simple_tag('sup', tag_stack + tags)
        fb2_out += s_out
        tags += s_tags

    # Process element text.
    if hasattr(elem_tree, 'text') and elem_tree.text:
        if not self.in_p:
            fb2_out.append('<p>')
        fb2_out.append(prepare_string_for_xml(elem_tree.text))
        if not self.in_p:
            fb2_out.append('</p>')

    # Process sub-elements.
    for item in elem_tree:
        fb2_out += self.dump_text(item, stylizer, page, tag_stack + tags)

    # Close open FB2 tags.
    tags.reverse()
    fb2_out += self.close_tags(tags)

    # Process element text that comes after the close of the XHTML tag but
    # before the next XHTML tag.
    if hasattr(elem_tree, 'tail') and elem_tree.tail:
        if not self.in_p:
            fb2_out.append('<p>')
        fb2_out.append(prepare_string_for_xml(elem_tree.tail))
        if not self.in_p:
            fb2_out.append('</p>')

    return fb2_out
def fb2_header(self):
    '''
    Build the opening <FictionBook> tag and the <description> block from
    the metadata of self.oeb_book.

    @return: The header as a string without empty lines.
    '''
    def scheme_of(identifier):
        # An identifier does not have to carry a scheme attribute.
        return (identifier.get(base.tag('opf', 'scheme'), None) or
                '').lower()

    metadata = {}
    metadata['title'] = self.oeb_book.metadata.title[0].value
    metadata['appname'] = __appname__
    metadata['version'] = __version__
    # Read the clock once so that day, month and year belong together.
    now = datetime.now()
    metadata['date'] = '%i.%i.%i' % (now.day, now.month, now.year)
    if self.oeb_book.metadata.language:
        lc = lang_as_iso639_1(self.oeb_book.metadata.language[0].value)
        if not lc:
            lc = self.oeb_book.metadata.language[0].value
        metadata['lang'] = lc or 'en'
    else:
        metadata['lang'] = u'en'
    metadata['id'] = None
    metadata['cover'] = self.get_cover()
    metadata['genre'] = self.opts.fb2_genre

    # One <author> element per creator. The name is split on spaces: the
    # first word is the first name, the last word the last name and
    # whatever is in between the middle name. A single word is a last name.
    authors = []
    for auth in self.oeb_book.metadata.creator:
        author_first = ''
        author_middle = ''
        author_last = ''
        author_parts = auth.value.split(' ')
        if len(author_parts) == 1:
            author_last = author_parts[0]
        elif len(author_parts) == 2:
            author_first = author_parts[0]
            author_last = author_parts[1]
        else:
            author_first = author_parts[0]
            author_middle = ' '.join(author_parts[1:-1])
            author_last = author_parts[-1]
        authors.append('<author>')
        authors.append('<first-name>%s</first-name>' %
                       prepare_string_for_xml(author_first))
        if author_middle:
            authors.append('<middle-name>%s</middle-name>' %
                           prepare_string_for_xml(author_middle))
        authors.append('<last-name>%s</last-name>' %
                       prepare_string_for_xml(author_last))
        authors.append('</author>')
    metadata['author'] = ''.join(authors)
    if not metadata['author']:
        metadata['author'] = ('<author><first-name></first-name>'
                              '<last-name></last-name></author>')

    metadata['keywords'] = ''
    tags = list(map(str, self.oeb_book.metadata.subject))
    if tags:
        tags = ', '.join(prepare_string_for_xml(x) for x in tags)
        metadata['keywords'] = '<keywords>%s</keywords>' % tags

    metadata['sequence'] = ''
    if self.oeb_book.metadata.series:
        index = '1'
        if self.oeb_book.metadata.series_index:
            index = self.oeb_book.metadata.series_index[0]
        # Both values end up inside attributes, so quotes have to be
        # escaped as well.
        seq = prepare_string_for_xml(str(self.oeb_book.metadata.series[0]),
                                     attribute=True)
        index = prepare_string_for_xml(str(index), attribute=True)
        metadata['sequence'] = ('<sequence name="%s" number="%s"/>' %
                                (seq, index))

    year = publisher = isbn = ''
    identifiers = self.oeb_book.metadata['identifier']
    for x in identifiers:
        if (scheme_of(x) == 'uuid' or
                str(x).startswith('urn:uuid:')):
            metadata['id'] = str(x).split(':')[-1]
            break
    if metadata['id'] is None:
        self.log.warn('No UUID identifier found')
        metadata['id'] = str(uuid.uuid4())

    try:
        date = self.oeb_book.metadata['date'][0]
    except IndexError:
        pass
    else:
        # Only the part in front of the first '-' is used as the year.
        year = ('<year>%s</year>' %
                prepare_string_for_xml(date.value.partition('-')[0]))

    try:
        publisher = self.oeb_book.metadata['publisher'][0]
    except IndexError:
        pass
    else:
        publisher = ('<publisher>%s</publisher>' %
                     prepare_string_for_xml(publisher.value))

    for x in identifiers:
        if scheme_of(x) == 'isbn':
            isbn = '<isbn>%s</isbn>' % prepare_string_for_xml(x.value)

    metadata['year'] = year
    metadata['isbn'] = isbn
    metadata['publisher'] = publisher
    # Escape the plain values. The keys listed here are skipped because
    # they are either markup built above from escaped parts or, for
    # 'cover', the result of get_cover() which is inserted unchanged.
    for key, value in metadata.items():
        if key not in ('author', 'cover', 'sequence', 'keywords', 'year',
                       'publisher', 'isbn'):
            metadata[key] = prepare_string_for_xml(value)

    try:
        comments = self.oeb_book.metadata['description'][0]
    except Exception:
        metadata['comments'] = ''
    else:
        from ebook_converter.utils.html2text import html2text
        annot = prepare_string_for_xml(html2text(comments.value).strip())
        metadata['comments'] = f'<annotation><p>{annot}</p></annotation>'

    # Keep the indentation level of the description the same as the body.
    header = textwrap.dedent('''\
        <FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:l="http://www.w3.org/1999/xlink">
        <description>
            <title-info>
                <genre>%(genre)s</genre>
                %(author)s
                <book-title>%(title)s</book-title>
                %(cover)s
                <lang>%(lang)s</lang>
                %(keywords)s
                %(sequence)s
                %(comments)s
            </title-info>
            <document-info>
                %(author)s
                <program-used>%(appname)s %(version)s</program-used>
                <date>%(date)s</date>
                <id>%(id)s</id>
                <version>1.0</version>
            </document-info>
            <publish-info>
                %(publisher)s
                %(year)s
                %(isbn)s
            </publish-info>
        </description>''') % metadata

    # Remove empty lines.
    return '\n'.join(filter(str.strip, header.splitlines()))
def comments_to_html(comments):
    r'''
    Convert random comment text to normalized, xml-legal block of <p>s

    'plain text' returns as
    <p>plain text</p>

    'plain text with <i>minimal</i> <b>markup</b>' returns as
    <p>plain text with <i>minimal</i> <b>markup</b></p>

    '<p>pre-formatted text</p>' returns untouched

    'A line of text\n\nFollowed by a line of text' returns as
    <p>A line of text</p>
    <p>Followed by a line of text</p>

    'A line of text.\nA second line of text.\rA third line of text' returns
    as
    <p>A line of text.<br />A second line of text.<br />A third line of
    text.</p>

    '...end of a paragraph.Somehow the break was lost...' returns as
    <p>...end of a paragraph.</p>
    <p>Somehow the break was lost...</p>

    Deprecated HTML returns as HTML via BeautifulSoup()

    Empty input, and input whose sanitising fails, returns as <p></p>.
    '''
    if not comments:
        return u'<p></p>'
    if not isinstance(comments, str):
        comments = comments.decode(preferred_encoding, 'replace')

    if comments.lstrip().startswith('<'):
        # Comment is already HTML do not mess with it
        return comments

    if '<' not in comments:
        # Plain text: escape it, one <p> per blank-line separated block.
        comments = prepare_string_for_xml(comments)
        parts = [u'<p class="description">%s</p>' %
                 x.replace(u'\n', u'<br />')
                 for x in comments.split('\n\n')]
        return '\n'.join(parts)

    if sanitize_pat.search(comments) is not None:
        try:
            return sanitize_comments_html(comments)
        except Exception:
            # Best effort: report the problem and fall back to an empty
            # paragraph. KeyboardInterrupt and SystemExit are not
            # swallowed.
            import traceback
            traceback.print_exc()
            return u'<p></p>'

    # Explode lost CRs to \n\n
    comments = lost_cr_exception_pat.sub(
        lambda m: m.group().replace('.', '.\r'), comments)
    for lost_cr in lost_cr_pat.finditer(comments):
        comments = comments.replace(
            lost_cr.group(),
            '%s%s\n\n%s' % (lost_cr.group(1), lost_cr.group(2),
                            lost_cr.group(3)))

    comments = comments.replace(u'\r', u'')
    # Convert \n\n to <p>s
    comments = comments.replace(u'\n\n', u'<p>')
    # Convert solo returns to <br />
    comments = comments.replace(u'\n', '<br />')
    # Convert two hyphens to emdash
    comments = comments.replace('--', '—')

    soup = html5_parser('<div>' + comments + '</div>').find('div')
    result = html5_parser('<div>')
    container = result.find('div')
    rtc = 0
    open_pTag = False

    all_tokens = list(soup.contents)
    inline_tags = ('br', 'b', 'i', 'em', 'strong', 'span', 'font', 'a',
                   'hr')
    for token in all_tokens:
        if isinstance(token, (bs4.CData, bs4.Comment, bs4.Declaration,
                              bs4.ProcessingInstruction)):
            continue
        if (isinstance(token, bs4.NavigableString) or
                token.name in inline_tags):
            # Text and inline tags are collected in a paragraph, which is
            # opened on demand.
            if not open_pTag:
                pTag = result.new_tag('p')
                open_pTag = True
                ptc = 0
            pTag.insert(ptc, token)
            ptc += 1
        else:
            # Any other tag ends the paragraph collected so far and is
            # moved into the result itself.
            if open_pTag:
                container.insert(rtc, pTag)
                rtc += 1
                open_pTag = False
                ptc = 0
            container.insert(rtc, token)
            rtc += 1

    if open_pTag:
        container.insert(rtc, pTag)

    for p in container.findAll('p'):
        p['class'] = 'description'

    return container.decode_contents()
def dump_text(self, elem, stylizer, page):
    '''
    Serialise elem and everything below it to a list of HTML fragments,
    with the style of every element written into its style attribute.

    The class and style attributes are dropped from every element (this
    modifies the tree that is passed in) and <body> is written as <div>.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element.
    @page: Passed on to the recursive calls.
    @return: List of strings that make up the markup when joined.
    '''

    # We can only process tags. If there isn't a tag return any text.
    if not isinstance(elem.tag, (str, bytes)) \
            or parse_utils.namespace(elem.tag) not in (const.XHTML_NS,
                                                       const.SVG_NS):
        p = elem.getparent()
        if p is not None and isinstance(p.tag, (str, bytes)) \
                and parse_utils.namespace(p.tag) in (const.XHTML_NS,
                                                     const.SVG_NS) \
                and elem.tail:
            # The tail is plain text from the tree, so it has to be escaped
            # like any other text before it is written as markup.
            return [self.prepare_string_for_html(elem.tail)]
        return ['']

    # Setup our variables.
    text = ['']
    style = stylizer.style(elem)
    tags = []
    tag = parse_utils.barename(elem.tag)
    attribs = elem.attrib

    style_a = '%s' % style
    if tag == 'body':
        # Change the body to a div so we can merge multiple files.
        tag = 'div'
        # Add page-break-before: always because renderers typically treat
        # a new file (we're merging files) as a page break and remove all
        # other page break types that might be set.
        style_a = 'page-break-before: always; %s' % re.sub(
            'page-break-[^:]+:[^;]+;?', '', style_a)
    # Remove unnecessary spaces.
    style_a = re.sub(r'\s{2,}', ' ', style_a).strip()
    tags.append(tag)

    # Remove attributes we won't want.
    if 'class' in attribs:
        del attribs['class']
    if 'style' in attribs:
        del attribs['style']

    # Turn the rest of the attributes into a string we can write with the
    # tag.
    at = ''.join(' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
                 for k, v in attribs.items())

    # Turn style into strings for putting in the tag.
    style_t = ''
    if style_a:
        # Double quotes would end the attribute value early.
        style_t = ' style="%s"' % style_a.replace('"', "'")

    # Write the tag.
    text.append('<%s%s%s' % (tag, at, style_t))
    if tag in SELF_CLOSING_TAGS:
        text.append(' />')
    else:
        text.append('>')

    # Process tags that contain text.
    if hasattr(elem, 'text') and elem.text:
        text.append(self.prepare_string_for_html(elem.text))

    # Recurse down into tags within the tag we are in.
    for item in elem:
        text += self.dump_text(item, stylizer, page)

    # Close all open tags.
    tags.reverse()
    for t in tags:
        if t not in SELF_CLOSING_TAGS:
            text.append('</%s>' % t)

    # Add the text that is outside of the tag.
    if hasattr(elem, 'tail') and elem.tail:
        text.append(self.prepare_string_for_html(elem.tail))

    return text
def dump_text(self, elem, stylizer, page):
    '''
    Serialise elem and everything below it to a list of HTML fragments,
    with bold, italic, underline and line-through styles turned into
    <b>, <i>, <u> and <s> tags.

    The class and style attributes are dropped from every element (this
    modifies the tree that is passed in), <body> is written as <div> and
    elements that are styled as not displayed are left out.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element.
    @page: Passed on to the recursive calls.
    @return: List of strings that make up the markup when joined.
    '''

    # We can only process tags. If there isn't a tag return any text.
    if not isinstance(elem.tag, (str, bytes)) \
            or parse_utils.namespace(elem.tag) not in (const.XHTML_NS,
                                                       const.SVG_NS):
        p = elem.getparent()
        if p is not None and isinstance(p.tag, (str, bytes)) \
                and parse_utils.namespace(p.tag) in (const.XHTML_NS,
                                                     const.SVG_NS) \
                and elem.tail:
            # The tail is plain text from the tree, so it has to be escaped
            # like any other text before it is written as markup.
            return [self.prepare_string_for_html(elem.tail)]
        return ['']

    # Setup our variables.
    text = ['']
    style = stylizer.style(elem)
    tags = []
    tag = parse_utils.barename(elem.tag)
    attribs = elem.attrib

    if tag == 'body':
        tag = 'div'
    tags.append(tag)

    # Ignore anything that is set to not be displayed.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
        # The text after the element is not part of the element and has to
        # survive.
        if hasattr(elem, 'tail') and elem.tail:
            return [self.prepare_string_for_html(elem.tail)]
        return ['']

    # Remove attributes we won't want.
    if 'class' in attribs:
        del attribs['class']
    if 'style' in attribs:
        del attribs['style']

    # Turn the rest of the attributes into a string we can write with the
    # tag.
    at = ''.join(' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
                 for k, v in attribs.items())

    # Write the tag.
    text.append('<%s%s' % (tag, at))
    if tag in SELF_CLOSING_TAGS:
        text.append(' />')
    else:
        text.append('>')

    # Turn styles into tags.
    if style['font-weight'] in ('bold', 'bolder'):
        text.append('<b>')
        tags.append('b')
    if style['font-style'] == 'italic':
        text.append('<i>')
        tags.append('i')
    if style['text-decoration'] == 'underline':
        text.append('<u>')
        tags.append('u')
    if style['text-decoration'] == 'line-through':
        text.append('<s>')
        tags.append('s')

    # Process tags that contain text.
    if hasattr(elem, 'text') and elem.text:
        text.append(self.prepare_string_for_html(elem.text))

    # Recurse down into tags within the tag we are in.
    for item in elem:
        text += self.dump_text(item, stylizer, page)

    # Close all open tags, the one opened last first.
    tags.reverse()
    for t in tags:
        if t not in SELF_CLOSING_TAGS:
            text.append('</%s>' % t)

    # Add the text that is outside of the tag.
    if hasattr(elem, 'tail') and elem.tail:
        text.append(self.prepare_string_for_html(elem.tail))

    return text