def chapter_head(self, match):
    """Return heading markup for a chapter regex match.

    ``match`` must expose the named groups ``chap`` and ``title``.

    * Without a title, the chapter text is wrapped in ``<h2>``.
    * With a title, the chapter goes in an ``<h2>`` whose ``title``
      attribute holds a plain-text "chapter, title" label, and the title
      itself follows in ``<h3 class="sigilNotInTOC">``.

    Every call increments ``self.html_preprocess_sections`` and writes a
    debug message to ``self.log``.
    """
    chap = match.group('chap')
    title = match.group('title')

    if not title:
        self.html_preprocess_sections = self.html_preprocess_sections + 1
        self.log.debug("marked %s chapters. - %s",
                       self.html_preprocess_sections, str(chap))
        return '<h2>' + chap + '</h2>\n'

    # Only the chapter+title branch needs the HTML-to-text converter.
    from ebook_converter.utils.html2text import html2text

    delete_whitespace = re.compile('^\\s*(?P<c>.*?)\\s*$')
    # A character class removes *any* single or double quote.  The previous
    # pattern '\'\"' only matched the literal two-character sequence '" and
    # so left lone quotes in place, which could terminate the title="..."
    # attribute early.
    delete_quotes = re.compile('[\'"]')

    txt_chap = delete_quotes.sub(
        '', delete_whitespace.sub('\\g<c>', html2text(chap)))
    txt_title = delete_quotes.sub(
        '', delete_whitespace.sub('\\g<c>', html2text(title)))

    self.html_preprocess_sections = self.html_preprocess_sections + 1
    self.log.debug("marked %s chapters & titles. - %s, %s",
                   self.html_preprocess_sections, chap, title)
    return ('<h2 title="' + txt_chap + ', ' + txt_title + '">' + chap +
            '</h2>\n<h3 class="sigilNotInTOC">' + title + '</h3>\n')
def markup_user_break(self, replacement_break):
    '''
    Takes string a user supplies and wraps it in markup that will be
    centered with appropriate margins.  <hr> and <img> tags are allowed.
    If the user specifies a style with width attributes in the <hr> tag
    then the appropriate margins are applied to wrapping divs.  This is
    because many ebook devices don't support margin:auto.  All other html
    is converted to text.

    Returns the scene break markup as a string.  An <hr> whose width
    cannot be parsed falls back to the default rule and logs a warning.
    '''
    hr_open = '<div id="scenebreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em; page-break-before:avoid">'
    default_hr = hr_open + '<hr style="height: 3px; background:#505050" /></div>'

    # No angle brackets at all: plain text, just normalise whitespace.
    if not re.search('(<|>)', replacement_break):
        replacement_break = re.sub('\\s', ' ', replacement_break)
        return self.scene_break_open + replacement_break + '</p>'

    if re.match('^<hr', replacement_break):
        if replacement_break.find('width') == -1:
            return default_hr

        # Accept both the attribute form (width=50) and the CSS form
        # (width:50 / width: 50), mirroring what the stripping regex below
        # removes.
        width_match = re.search('width(?::\\s*|=)(?P<wnum>\\d+)',
                                replacement_break)
        if width_match is None:
            self.log.warning('Invalid replacement scene break'
                             ' expression, using default')
            return default_hr

        width = int(width_match.group('wnum'))
        # The width moves from the <hr> to the wrapping div's margins.
        replacement_break = re.sub(
            '(?i)(width=\\d+\\%?|width:\\s*\\d+(\\%|px|pt|em)?;?)', '',
            replacement_break)
        divpercent = (100 - width) // 2
        hr_open = re.sub('45', str(divpercent), hr_open)
        return hr_open + replacement_break + '</div>'

    if re.match('^<img', replacement_break):
        return self.scene_break_open + replacement_break + '</p>'

    # Any other markup is flattened to text.
    from ebook_converter.utils.html2text import html2text
    replacement_break = html2text(replacement_break)
    replacement_break = re.sub('\\s', ' ', replacement_break)
    return self.scene_break_open + replacement_break + '</p>'
def create_bibtex_entry(entry, fields, mode, template_citation, bibtexdict,
                        db, citation_bibtex=True, calibre_files=True):
    """Render one catalog entry as a BibTeX record string.

    :param entry: mapping of field name to value for one book.
    :param fields: iterable of field names to emit, in output order.  Names
        starting with ``#`` are fetched through ``db.get_field``.
    :param mode: ``"misc"`` always produces ``@misc``; ``"book"`` produces
        ``@book`` or nothing; any other value picks ``@book`` when
        ``check_entry_book_valid(entry)`` is true and ``@misc`` otherwise.
    :param template_citation: passed through to ``make_bibtex_citation``.
    :param bibtexdict: object providing ``bibtex_author_format``,
        ``utf8ToBibtex`` and ``stripUnmatchedSyntax``.
    :param db: database object, used only for ``#`` fields.
    :param citation_bibtex: when true, a citation tag is added after the
        opening ``@type{``.
    :param calibre_files: when true, a ``file`` field is emitted alongside
        ``formats``.
    :return: the record text, or ``''`` when ``mode`` is ``"book"`` and the
        entry is not a valid book.
    """
    # Bibtex doesn't like UTF-8 but keep unicode until writing
    if mode != "misc" and check_entry_book_valid(entry):
        bibtex_entry = ['@book{']
    elif mode != "book":
        bibtex_entry = ['@misc{']
    else:
        # Strict book mode and the entry did not validate as a book.
        return ''

    if citation_bibtex:
        # Citation tag, glued to the opening brace as one element.
        bibtex_entry.append(
            make_bibtex_citation(entry, template_citation, bibtexdict))
        bibtex_entry = [' '.join(bibtex_entry)]

    for field in fields:
        if field.startswith('#'):
            item = db.get_field(entry['id'], field, index_is_id=True)
            if isinstance(item, (bool, numbers.Number)):
                item = repr(item)
        elif field == 'title_sort':
            item = entry['sort']
        elif field == 'library_name':
            item = library_name
        else:
            item = entry[field]

        # Skip fields that are missing or empty.
        if item is None:
            continue
        try:
            if len(item) == 0:
                continue
        except TypeError:
            # Unsized values (numbers, dates, ...) are never "empty".
            pass

        if field == 'authors':
            bibtex_entry.append(
                'author = "%s"' % bibtexdict.bibtex_author_format(item))
        elif field == 'id':
            bibtex_entry.append('calibreid = "%s"' % int(item))
        elif field == 'rating':
            bibtex_entry.append('rating = "%s"' % int(item))
        elif field == 'size':
            bibtex_entry.append('%s = "%s octets"' % (field, int(item)))
        elif field == 'tags':
            # A list to flatten
            bibtex_entry.append(
                'tags = "%s"' % bibtexdict.utf8ToBibtex(', '.join(item)))
        elif field == 'comments':
            # \n removal
            item = item.replace('\r\n', ' ')
            item = item.replace('\n', ' ')
            # unmatched brace removal (users should use \leftbrace or
            # \rightbrace for single braces)
            item = bibtexdict.stripUnmatchedSyntax(item, '{', '}')
            # html to text; best effort, the unconverted text is kept on
            # failure.  Catch Exception rather than everything so that
            # KeyboardInterrupt/SystemExit still propagate.
            try:
                item = html2text(item)
            except Exception:
                log.warn("Failed to convert comments to text")
            bibtex_entry.append(
                'note = "%s"' % bibtexdict.utf8ToBibtex(item))
        elif field == 'isbn':
            # Could be 9, 10 or 13 digits
            bibtex_entry.append('isbn = "%s"' % format_isbn(item))
        elif field == 'formats':
            # Lower-cased extensions of every available format.
            extensions = [path.rpartition('.')[2].lower() for path in item]
            bibtex_entry.append('formats = "%s"' % ', '.join(extensions))
            if calibre_files:
                # Add file path if format is selected
                files = [
                    ':%s:%s' % (path, path.rpartition('.')[2].upper())
                    for path in item
                ]
                bibtex_entry.append('file = "%s"' % ', '.join(files))
        elif field == 'series_index':
            bibtex_entry.append('volume = "%s"' % int(item))
        elif field == 'timestamp':
            bibtex_entry.append(
                'timestamp = "%s"' % isoformat(item).partition('T')[0])
        elif field == 'pubdate':
            bibtex_entry.append('year = "%s"' % item.year)
            bibtex_entry.append(
                'month = "%s"'
                % bibtexdict.utf8ToBibtex(date.strftime("%b", item)))
        elif field.startswith('#') and isinstance(item, (str, bytes)):
            bibtex_entry.append(
                'custom_%s = "%s"'
                % (field[1:], bibtexdict.utf8ToBibtex(item)))
        elif isinstance(item, (str, bytes)):
            # Any remaining textual field (title, publisher, cover, uuid,
            # ondevice, author_sort, series, title_sort, ...).
            bibtex_entry.append(
                '%s = "%s"' % (field, bibtexdict.utf8ToBibtex(item)))

    bibtex_entry = ',\n '.join(bibtex_entry)
    bibtex_entry += ' }\n\n'
    return bibtex_entry
def fb2_header(self):
    """Build the opening ``<FictionBook>``/``<description>`` header text.

    Values are read from ``self.oeb_book.metadata`` (title, language,
    creators, subjects, series, identifiers, date, publisher, description),
    ``self.opts.fb2_genre`` and ``self.get_cover()``.  Plain values are
    passed through ``prepare_string_for_xml``; pre-built XML fragments
    (author, cover, sequence, keywords, year, publisher, isbn) are inserted
    as-is.

    If no UUID identifier is found a warning is logged and a random UUID is
    generated.  Returns the header with blank lines removed.
    """
    book_meta = self.oeb_book.metadata
    # Read the clock once so day, month and year always belong together.
    now = datetime.now()

    metadata = {}
    metadata['title'] = book_meta.title[0].value
    metadata['appname'] = __appname__
    metadata['version'] = __version__
    metadata['date'] = '%i.%i.%i' % (now.day, now.month, now.year)
    if book_meta.language:
        lc = lang_as_iso639_1(book_meta.language[0].value)
        if not lc:
            lc = book_meta.language[0].value
        metadata['lang'] = lc or 'en'
    else:
        metadata['lang'] = 'en'
    metadata['id'] = None
    metadata['cover'] = self.get_cover()
    metadata['genre'] = self.opts.fb2_genre

    # One <author> element per creator; names are split on single spaces
    # into first / middle / last parts.
    metadata['author'] = ''
    for auth in book_meta.creator:
        author_first = ''
        author_middle = ''
        author_last = ''
        author_parts = auth.value.split(' ')
        if len(author_parts) == 1:
            author_last = author_parts[0]
        elif len(author_parts) == 2:
            author_first = author_parts[0]
            author_last = author_parts[1]
        else:
            author_first = author_parts[0]
            author_middle = ' '.join(author_parts[1:-1])
            author_last = author_parts[-1]
        metadata['author'] += '<author>'
        metadata['author'] += ('<first-name>%s</first-name>' %
                               prepare_string_for_xml(author_first))
        if author_middle:
            metadata['author'] += ('<middle-name>%s</middle-name>' %
                                   prepare_string_for_xml(author_middle))
        metadata['author'] += ('<last-name>%s</last-name>' %
                               prepare_string_for_xml(author_last))
        metadata['author'] += '</author>'
    if not metadata['author']:
        metadata['author'] = ('<author><first-name></first-name>'
                              '<last-name></last-name></author>')

    metadata['keywords'] = ''
    tags = list(map(str, book_meta.subject))
    if tags:
        tags = ', '.join(prepare_string_for_xml(x) for x in tags)
        metadata['keywords'] = '<keywords>%s</keywords>' % tags

    metadata['sequence'] = ''
    if book_meta.series:
        index = '1'
        if book_meta.series_index:
            index = book_meta.series_index[0]
        seq = prepare_string_for_xml(str(book_meta.series[0]))
        metadata['sequence'] = ('<sequence name="%s" number="%s"/>' %
                                (seq, index))

    year = publisher = isbn = ''
    identifiers = book_meta['identifier']
    scheme_attr = base.tag('opf', 'scheme')

    for x in identifiers:
        # The scheme attribute may be absent; treat that as an empty scheme
        # so the 'urn:uuid:' fallback can still be evaluated instead of
        # failing on None.lower().
        scheme = (x.get(scheme_attr, None) or '').lower()
        if scheme == 'uuid' or str(x).startswith('urn:uuid:'):
            metadata['id'] = str(x).split(':')[-1]
            break
    if metadata['id'] is None:
        self.log.warn('No UUID identifier found')
        metadata['id'] = str(uuid.uuid4())

    try:
        pub_date = book_meta['date'][0]
    except IndexError:
        pass
    else:
        # Only the part before the first '-' is used as the year.
        year = ('<year>%s</year>' %
                prepare_string_for_xml(pub_date.value.partition('-')[0]))

    try:
        publisher_item = book_meta['publisher'][0]
    except IndexError:
        pass
    else:
        publisher = ('<publisher>%s</publisher>' %
                     prepare_string_for_xml(publisher_item.value))

    for x in identifiers:
        # No break: if several ISBN identifiers exist the last one wins.
        if (x.get(scheme_attr, None) or '').lower() == 'isbn':
            isbn = '<isbn>%s</isbn>' % prepare_string_for_xml(x.value)

    metadata['year'] = year
    metadata['isbn'] = isbn
    metadata['publisher'] = publisher

    # Escape everything that is not already an XML fragment.  Only values
    # of existing keys are replaced, so mutating during iteration is safe.
    for key, value in metadata.items():
        if key not in ('author', 'cover', 'sequence', 'keywords', 'year',
                       'publisher', 'isbn'):
            metadata[key] = prepare_string_for_xml(value)

    try:
        comments = book_meta['description'][0]
    except Exception:
        metadata['comments'] = ''
    else:
        from ebook_converter.utils.html2text import html2text
        annot = prepare_string_for_xml(html2text(comments.value).strip())
        metadata['comments'] = f'<annotation><p>{annot}</p></annotation>'

    # Keep the indentation level of the description the same as the body.
    header = textwrap.dedent('''\
        <FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:l="http://www.w3.org/1999/xlink">
        <description>
            <title-info>
                <genre>%(genre)s</genre>
                %(author)s
                <book-title>%(title)s</book-title>
                %(cover)s
                <lang>%(lang)s</lang>
                %(keywords)s
                %(sequence)s
                %(comments)s
            </title-info>
            <document-info>
                %(author)s
                <program-used>%(appname)s %(version)s</program-used>
                <date>%(date)s</date>
                <id>%(id)s</id>
                <version>1.0</version>
            </document-info>
            <publish-info>
                %(publisher)s
                %(year)s
                %(isbn)s
            </publish-info>
        </description>''') % metadata

    # Remove empty lines.
    return '\n'.join(filter(str.strip, header.splitlines()))
def sanitize_comments_html(html):
    """Return ``html`` re-rendered via html2text followed by Markdown.

    The input is first converted with ``html2text`` and the result is then
    converted back to HTML by a fresh ``Markdown`` instance.
    """
    from ebook_converter.ebooks.markdown import Markdown

    as_text = html2text(html)
    renderer = Markdown()
    return renderer.convert(as_text)
def run(self, path_to_output, opts, db, notification=DummyReporter()):
    """Export catalog data to ``path_to_output`` as CSV or XML.

    The output format is taken from the extension of ``path_to_output``
    (stored in ``self.fmt``); only ``csv`` and ``xml`` produce a file.

    :param path_to_output: destination file path.
    :param opts: options object; this method reads ``verbose``,
        ``connected_device``, ``ids``, ``search_text``, ``fields`` and
        optionally ``library_path``.  ``search_text`` is reset to ``None``
        when ``ids`` is set.
    :param db: database object used for searching, field metadata and
        custom (``#``) field lookup.
    :param notification: stored on ``self.notification``.
    """
    from ebook_converter.library import current_library_name
    from ebook_converter.utils.date import isoformat
    from ebook_converter.utils.html2text import html2text
    from ebook_converter.utils.logging import default_log as log
    from lxml import etree
    from ebook_converter.ebooks.metadata import authors_to_string

    self.fmt = path_to_output.rpartition('.')[2]
    self.notification = notification
    current_library = current_library_name()
    if getattr(opts, 'library_path', None):
        current_library = os.path.basename(opts.library_path)

    if opts.verbose:
        opts_dict = vars(opts)
        log("%s('%s'): Generating %s" % (self.name, current_library,
                                         self.fmt.upper()))
        if opts.connected_device['is_device_connected']:
            log(" connected_device: %s" % opts.connected_device['name'])
        if opts_dict['search_text']:
            log(" --search='%s'" % opts_dict['search_text'])

        if opts_dict['ids']:
            log(" Book count: %d" % len(opts_dict['ids']))
            if opts_dict['search_text']:
                log(" (--search ignored when a subset of the database is specified)")

        if opts_dict['fields']:
            if opts_dict['fields'] == 'all':
                log(" Fields: %s" % ', '.join(FIELDS[1:]))
            else:
                log(" Fields: %s" % opts_dict['fields'])

    # If a list of ids are provided, don't use search_text
    if opts.ids:
        opts.search_text = None

    data = self.search_sort_db(db, opts)

    if not len(data):
        # Deliberately not fatal: an export with only headers / an empty
        # root element is still written below.
        log.error(
            "\nNo matching database entries for search criteria '%s'" %
            opts.search_text)

    # Get the requested output fields as a list
    fields = self.get_output_fields(db, opts)

    # If connected device, add 'On Device' values to data
    if (opts.connected_device['is_device_connected'] and
            'ondevice' in fields):
        for entry in data:
            entry['ondevice'] = db.catalog_plugin_on_device_temp_mapping[
                entry['id']]['ondevice']

    fm = {x: db.field_metadata.get(x, {}) for x in fields}

    if self.fmt == 'csv':
        # The context manager guarantees the file is closed even when a
        # row fails to serialise part-way through the export.
        with codecs.open(path_to_output, 'w', 'utf8') as outfile:
            # Write a UTF-8 BOM
            outfile.write('\ufeff')

            # Output the field headers
            outfile.write('%s\n' % ','.join(fields))

            # Output the entry fields
            for entry in data:
                outstr = []
                for field in fields:
                    if field.startswith('#'):
                        item = db.get_field(entry['id'], field,
                                            index_is_id=True)
                        if isinstance(item, (list, tuple)):
                            if fm.get(field, {}).get('display', {}).get(
                                    'is_names', False):
                                item = ' & '.join(item)
                            else:
                                item = ', '.join(item)
                    elif field == 'library_name':
                        item = current_library
                    elif field == 'title_sort':
                        item = entry['sort']
                    else:
                        item = entry[field]

                    if item is None:
                        outstr.append('""')
                        continue
                    elif field == 'formats':
                        item = ', '.join(
                            path.rpartition('.')[2].lower() for path in item)
                    elif field == 'authors':
                        item = authors_to_string(item)
                    elif field == 'tags':
                        item = ', '.join(item)
                    elif field == 'isbn':
                        # Could be 9, 10 or 13 digits, with hyphens,
                        # possibly ending in 'X'
                        item = '%s' % re.sub(r'[^\dX-]', '', item)
                    elif fm.get(field, {}).get('datatype') == 'datetime':
                        item = isoformat(item, as_utc=False)
                    elif field == 'comments':
                        item = item.replace('\r\n', ' ')
                        item = item.replace('\n', ' ')
                    elif (fm.get(field, {}).get('datatype', None) == 'rating'
                          and item):
                        item = '%.2g' % (item / 2)

                    # Convert HTML to markdown text, but only when the value
                    # contains an opening tag and ends with the matching
                    # closing tag.
                    if isinstance(item, str):
                        opening_tag = re.search(r'<(\w+)( |>)', item)
                        if opening_tag:
                            closing_tag = re.search(
                                r'<\/%s>$' % opening_tag.group(1), item)
                            if closing_tag:
                                item = html2text(item)

                    # CSV quoting: wrap in quotes and double embedded ones.
                    outstr.append('"%s"' % str(item).replace('"', '""'))

                outfile.write(','.join(outstr) + '\n')

    elif self.fmt == 'xml':
        from lxml.builder import E

        root = E.calibredb()
        for r in data:
            record = E.record()
            root.append(record)

            # Custom columns: '#' is not valid in an element name, so it
            # is replaced with '_'.
            for field in fields:
                if field.startswith('#'):
                    val = db.get_field(r['id'], field, index_is_id=True)
                    if not isinstance(val, str):
                        val = str(val)
                    item = getattr(E, field.replace('#', '_'))(val)
                    record.append(item)

            for field in ('id', 'uuid', 'publisher', 'rating', 'size',
                          'isbn', 'ondevice', 'identifiers'):
                if field in fields:
                    val = r[field]
                    if not val:
                        continue
                    if not isinstance(val, (bytes, str)):
                        if (fm.get(field, {}).get('datatype', None) ==
                                'rating' and val):
                            val = '%.2g' % (val / 2)
                        val = str(val)
                    item = getattr(E, field)(val)
                    record.append(item)

            if 'title' in fields:
                title = E.title(r['title'], sort=r['sort'])
                record.append(title)

            if 'authors' in fields:
                aus = E.authors(sort=r['author_sort'])
                for au in r['authors']:
                    aus.append(E.author(au))
                record.append(aus)

            for field in ('timestamp', 'pubdate'):
                if field in fields:
                    record.append(
                        getattr(E, field)(isoformat(r[field], as_utc=False)))

            if 'tags' in fields and r['tags']:
                tags = E.tags()
                for tag in r['tags']:
                    tags.append(E.tag(tag))
                record.append(tags)

            if 'comments' in fields and r['comments']:
                record.append(E.comments(r['comments']))

            if 'series' in fields and r['series']:
                record.append(
                    E.series(r['series'], index=str(r['series_index'])))

            if 'cover' in fields and r['cover']:
                record.append(E.cover(r['cover'].replace(os.sep, '/')))

            if 'formats' in fields and r['formats']:
                formats_el = E.formats()
                for path in r['formats']:
                    formats_el.append(E.format(path.replace(os.sep, '/')))
                record.append(formats_el)

            if 'library_name' in fields:
                record.append(E.library_name(current_library))

        with open(path_to_output, 'wb') as xml_file:
            xml_file.write(
                etree.tostring(root, encoding='utf-8', xml_declaration=True,
                               pretty_print=True))
def _set_comments(title_info, mi, ctx): if not mi.is_null('comments'): from ebook_converter.utils.html2text import html2text ctx.clear_meta_tags(title_info, 'annotation') title = ctx.get_or_create(title_info, 'annotation') ctx.text2fb2(title, html2text(mi.comments))