def get_download_filename_from_response(response):
    '''
    Work out a file name for a downloaded resource.

    The ``filename*`` / ``filename`` parameter of the response's
    ``Content-disposition`` header is preferred. When that is missing or
    empty, the last component of the final URL's path is used instead.

    :param response: object providing ``geturl()`` and ``info()``; the
        latter must return a header container with a ``get()`` method.
    :return: the percent-decoded file name, or the (possibly empty) last URL
        path component.

    Errors while inspecting the response are printed with
    ``traceback.print_exc()`` and do not propagate.
    '''
    import re
    from polyglot.urllib import unquote, urlparse
    filename = last_part_name = ''
    try:
        purl = urlparse(response.geturl())
        last_part_name = unquote(purl.path.split('/')[-1])
        disposition = response.info().get('Content-disposition', '')
        if isinstance(disposition, bytes):
            disposition = disposition.decode('utf-8', 'replace')
        plain = extended = ''
        # Split the header on ';' while keeping double-quoted runs together,
        # so that a ';' inside a quoted name does not cut the value short.
        # The trailing '"' alternative lets an unbalanced quote through
        # instead of dropping the rest of the header.
        for param in re.findall(r'(?:[^;"]|"[^"]*"|")+', disposition):
            name, sep, value = param.partition('=')
            if not sep:
                continue
            name, value = name.strip().lower(), value.strip()
            if name == 'filename*' and not extended:
                # Extended form is charset'language'value; keep the value
                extended = value.split('\'')[-1]
            elif name == 'filename' and not plain:
                plain = value
        # Only the matched parameter is used, so parameters that follow it
        # (size=..., creation-date=...) can no longer be mistaken for the
        # file name. The extended form wins when both are present.
        filename = extended or plain
        if filename[:1] in ('\'', '"'):
            filename = filename[1:]
        if filename[-1:] in ('\'', '"'):
            filename = filename[:-1]
        filename = unquote(filename)
    except Exception:
        import traceback
        traceback.print_exc()
    return filename or last_part_name
def get_download_filename_from_response(response):
    '''
    Work out a file name for a downloaded resource.

    The ``filename*`` / ``filename`` parameter of the response's
    ``Content-disposition`` header is preferred. When that is missing or
    empty, the last component of the final URL's path is used instead.

    :param response: object providing ``geturl()`` and ``info()``; the
        latter must return a header container with a ``get()`` method.
    :return: the percent-decoded file name, or the (possibly empty) last URL
        path component.

    Errors while inspecting the response are printed with
    ``traceback.print_exc()`` and do not propagate.
    '''
    import re
    from polyglot.urllib import unquote, urlparse
    filename = last_part_name = ''
    try:
        purl = urlparse(response.geturl())
        last_part_name = unquote(purl.path.split('/')[-1])
        disposition = response.info().get('Content-disposition', '')
        if isinstance(disposition, bytes):
            disposition = disposition.decode('utf-8', 'replace')
        plain = extended = ''
        # Split the header on ';' while keeping double-quoted runs together,
        # so that a ';' inside a quoted name does not cut the value short.
        # The trailing '"' alternative lets an unbalanced quote through
        # instead of dropping the rest of the header.
        for param in re.findall(r'(?:[^;"]|"[^"]*"|")+', disposition):
            name, sep, value = param.partition('=')
            if not sep:
                continue
            name, value = name.strip().lower(), value.strip()
            if name == 'filename*' and not extended:
                # Extended form is charset'language'value; keep the value
                extended = value.split('\'')[-1]
            elif name == 'filename' and not plain:
                plain = value
        # Only the matched parameter is used, so parameters that follow it
        # (size=..., creation-date=...) can no longer be mistaken for the
        # file name. The extended form wins when both are present.
        filename = extended or plain
        if filename[:1] in ('\'', '"'):
            filename = filename[1:]
        if filename[-1:] in ('\'', '"'):
            filename = filename[:-1]
        filename = unquote(filename)
    except Exception:
        import traceback
        traceback.print_exc()
    return filename or last_part_name
def __init__(self, href_or_path, basedir=os.getcwdu(), is_path=True):
    '''
    Initialise from either a filesystem path or an href.

    :param href_or_path: a path when ``is_path`` is true, otherwise a URL.
    :param basedir: directory against which relative paths are resolved.
    :param is_path: selects how ``href_or_path`` is interpreted.

    Sets ``path`` for local resources and ``_href`` for URLs whose scheme is
    neither empty nor ``file``. ``mime_type`` falls back to
    ``application/octet-stream`` when it cannot be guessed.
    '''
    self._href = None
    self._basedir = basedir
    self.path = None
    self.fragment = ''

    try:
        guessed = guess_type(href_or_path)[0]
    except:
        guessed = None
    self.mime_type = 'application/octet-stream' if guessed is None else guessed

    if is_path:
        location = href_or_path
        if not os.path.isabs(location):
            location = os.path.abspath(os.path.join(basedir, location))
        if isinstance(location, bytes):
            location = location.decode(sys.getfilesystemencoding())
        self.path = location
        return

    parsed = urlparse(href_or_path)
    if parsed[0] not in ('', 'file'):
        # Remote resource: remember the href verbatim, there is no path
        self._href = href_or_path
        return

    # Unquote the path component as bytes, then decode the result as UTF-8
    raw_path = parsed[2]
    if isinstance(raw_path, unicode_type):
        raw_path = raw_path.encode('utf-8')
    decoded = unquote(raw_path).decode('utf-8')
    self.path = os.path.abspath(
        os.path.join(basedir, decoded.replace('/', os.sep)))
    self.fragment = unquote(parsed[-1])
def __init__(self, href_or_path, basedir=os.getcwdu(), is_path=True):
    '''
    Initialise from either a filesystem path or an href.

    :param href_or_path: a path when ``is_path`` is true, otherwise a URL.
    :param basedir: directory against which relative paths are resolved.
    :param is_path: selects how ``href_or_path`` is interpreted.

    Sets ``path`` for local resources and ``_href`` for URLs whose scheme is
    neither empty nor ``file``. ``mime_type`` falls back to
    ``application/octet-stream`` when it cannot be guessed.
    '''
    self._href = None
    self._basedir = basedir
    self.path = None
    self.fragment = ''

    try:
        guessed = guess_type(href_or_path)[0]
    except:
        guessed = None
    self.mime_type = 'application/octet-stream' if guessed is None else guessed

    if is_path:
        location = href_or_path
        if not os.path.isabs(location):
            location = os.path.abspath(os.path.join(basedir, location))
        if isinstance(location, bytes):
            location = location.decode(sys.getfilesystemencoding())
        self.path = location
        return

    parsed = urlparse(href_or_path)
    if parsed[0] not in ('', 'file'):
        # Remote resource: remember the href verbatim, there is no path
        self._href = href_or_path
        return

    # Unquote the path component as bytes, then decode the result as UTF-8
    raw_path = parsed[2]
    if isinstance(raw_path, unicode_type):
        raw_path = raw_path.encode('utf-8')
    decoded = unquote(raw_path).decode('utf-8')
    self.path = os.path.abspath(
        os.path.join(basedir, decoded.replace('/', os.sep)))
    self.fragment = unquote(parsed[-1])
def process_navpoint(np, dest):
    '''
    Add the navigation point ``np`` to ``dest`` and recurse into its children.

    An item is only created when ``np`` has both label text and a content
    element; otherwise its children are attached directly to ``dest``.
    The play order defaults to 1 when it cannot be read as an integer.
    '''
    try:
        play_order = int(get_attr(np, 1))
    except:
        play_order = 1
    href = fragment = text = None
    parent = dest
    labels = nl_path(np)
    if labels:
        label = labels[0]
        text = ''.join(
            etree.tostring(piece, method='text', encoding='unicode',
                           with_tail=False)
            for piece in txt_path(label))
        targets = content_path(np)
        if targets and text:
            purl = urlparse(targets[0].get('src'))
            href, fragment = unquote(purl[2]), unquote(purl[5])
            parent = dest.add_item(href, fragment, text)
            parent.play_order = play_order

    for child in np_path(np):
        process_navpoint(child, parent)
def process_navpoint(np, dest):
    '''
    Add the navigation point ``np`` to ``dest`` and recurse into its children.

    An item is only created when ``np`` has both label text and a content
    element; otherwise its children are attached directly to ``dest``.
    The play order defaults to 1 when it cannot be read as an integer.
    '''
    try:
        play_order = int(get_attr(np, 1))
    except:
        play_order = 1
    href = fragment = text = None
    parent = dest
    labels = nl_path(np)
    if labels:
        label = labels[0]
        text = u''.join(
            etree.tostring(piece, method='text', encoding='unicode',
                           with_tail=False)
            for piece in txt_path(label))
        targets = content_path(np)
        if targets and text:
            purl = urlparse(targets[0].get('src'))
            href, fragment = unquote(purl[2]), unquote(purl[5])
            parent = dest.add_item(href, fragment, text)
            parent.play_order = play_order

    for child in np_path(np):
        process_navpoint(child, parent)
def parse_uri(uri, parse_query=True):
    '''
    Split a request URI into ``(scheme, path, query)``.

    :param uri: the raw Request-URI, handed to ``parse_request_uri``.
    :param parse_query: when false the query string is not parsed and
        ``query`` is returned as ``None``.
    :return: the ASCII-decoded scheme (left as is when empty), a tuple of
        decoded non-empty path components, and the parsed query.
    :raises HTTPSimpleResponse: with BAD_REQUEST when the path is missing,
        contains a fragment, or any part cannot be decoded/parsed.
    '''
    scheme, authority, path = parse_request_uri(uri)
    if path is None:
        raise HTTPSimpleResponse(http_client.BAD_REQUEST, "No path component")
    if b'#' in path:
        raise HTTPSimpleResponse(http_client.BAD_REQUEST, "Illegal #fragment in Request-URI.")

    if scheme:
        try:
            scheme = scheme.decode('ascii')
        except ValueError:
            raise HTTPSimpleResponse(http_client.BAD_REQUEST, 'Un-decodeable scheme')

    path, sep, qs = path.partition(b'?')
    query = None
    if parse_query:
        try:
            query = MultiDict.create_from_query_string(qs)
        except Exception:
            raise HTTPSimpleResponse(http_client.BAD_REQUEST, 'Unparseable query string')

    # Pieces between quoted slashes are decoded separately and re-joined
    # with a literal %2F, so that quoted slashes survive the split on '/'
    # below and are only turned into real slashes inside each component.
    try:
        decoded = [unquote(chunk).decode('utf-8') for chunk in quoted_slash.split(path)]
    except ValueError as e:
        raise HTTPSimpleResponse(http_client.BAD_REQUEST, as_unicode(e))
    joined = '%2F'.join(decoded)
    components = tuple(
        part.replace('%2F', '/') for part in joined.split('/') if part)

    return scheme, components, query
def parse_uri(uri, parse_query=True):
    '''
    Split a request URI into ``(scheme, path, query)``.

    :param uri: the raw Request-URI, handed to ``parse_request_uri``.
    :param parse_query: when false the query string is not parsed and
        ``query`` is returned as ``None``.
    :return: the ASCII-decoded scheme (left as is when empty), a tuple of
        decoded non-empty path components, and the parsed query.
    :raises HTTPSimpleResponse: with BAD_REQUEST when the path is missing,
        contains a fragment, or any part cannot be decoded/parsed.
    '''
    scheme, authority, path = parse_request_uri(uri)
    if path is None:
        raise HTTPSimpleResponse(http_client.BAD_REQUEST, "No path component")
    if b'#' in path:
        raise HTTPSimpleResponse(http_client.BAD_REQUEST, "Illegal #fragment in Request-URI.")

    if scheme:
        try:
            scheme = scheme.decode('ascii')
        except ValueError:
            raise HTTPSimpleResponse(http_client.BAD_REQUEST, 'Un-decodeable scheme')

    path, sep, qs = path.partition(b'?')
    query = None
    if parse_query:
        try:
            query = MultiDict.create_from_query_string(qs)
        except Exception:
            raise HTTPSimpleResponse(http_client.BAD_REQUEST, 'Unparseable query string')

    # Pieces between quoted slashes are decoded separately and re-joined
    # with a literal %2F, so that quoted slashes survive the split on '/'
    # below and are only turned into real slashes inside each component.
    try:
        decoded = [unquote(chunk).decode('utf-8') for chunk in quoted_slash.split(path)]
    except ValueError as e:
        raise HTTPSimpleResponse(http_client.BAD_REQUEST, as_unicode(e))
    joined = '%2F'.join(decoded)
    components = tuple(
        part.replace('%2F', '/') for part in joined.split('/') if part)

    return scheme, components, query
def dnd_get_image(md, image_exts=None):
    '''
    Get the image in the QMimeData object md.

    :param image_exts: extensions treated as images; defaults to the result
        of ``image_extensions()``.
    :return: None, None if no image is found
             QPixmap, None if an image is found, the pixmap is guaranteed not null
             url, filename if a URL that points to an image is found
    '''
    if md.hasImage():
        for fmt in md.formats():
            fmt = unicode_type(fmt)
            if not fmt.startswith('image/'):
                continue
            pixmap = QPixmap()
            pixmap.loadFromData(bytes(md.data(fmt)))
            if not pixmap.isNull():
                return pixmap, None
            # Only the first image/* format is tried
            break
    if md.hasFormat('application/octet-stream'):
        pixmap = QPixmap()
        pixmap.loadFromData(bytes(md.data('application/octet-stream')))
        if not pixmap.isNull():
            return pixmap, None

    if image_exts is None:
        image_exts = image_extensions()

    # No image, look for an URL pointing to an image
    urls = urls_from_md(md)
    # First look for a local file with an image extension that exists
    local_images = [
        candidate for candidate in (path_from_qurl(u) for u in urls)
        if posixpath.splitext(unquote(candidate))[1][1:].lower() in image_exts
    ]
    local_images = [candidate for candidate in local_images if os.path.exists(candidate)]
    loaded = QPixmap()
    for path in local_images:
        try:
            with open(path, 'rb') as f:
                loaded.loadFromData(f.read())
        except Exception:
            continue
        if not loaded.isNull():
            return loaded, None

    # No local images, look for remote ones

    # First, see if this is from Firefox
    rurl, fname = get_firefox_rurl(md, image_exts)
    if rurl and fname:
        return rurl, fname

    # Look through all remaining URLs, the first hit wins
    for remote_url, filename in remote_urls_from_qurl(urls, image_exts):
        return remote_url, filename

    return None, None
def path_from_qurl(qurl):
    '''
    Return the unquoted path of ``qurl`` as unicode text.

    The URL is encoded with every non-path component removed. When
    ``iswindows`` is set, a single leading ``/`` is dropped from the result.
    '''
    path_only = (
        QUrl.PreferLocalFile | QUrl.RemoveScheme | QUrl.RemovePassword |
        QUrl.RemoveUserInfo | QUrl.RemovePort | QUrl.RemoveAuthority |
        QUrl.RemoveQuery | QUrl.RemoveFragment)
    encoded = bytes(qurl.toEncoded(path_only))
    path = as_unicode_polyglot(unquote(encoded), errors='replace')
    if iswindows and path.startswith('/'):
        return path[1:]
    return path
def dnd_get_image(md, image_exts=None):
    '''
    Get the image in the QMimeData object md.

    :param image_exts: extensions treated as images; defaults to the result
        of ``image_extensions()``.
    :return: None, None if no image is found
             QPixmap, None if an image is found, the pixmap is guaranteed not null
             url, filename if a URL that points to an image is found
    '''
    if md.hasImage():
        for fmt in md.formats():
            fmt = unicode_type(fmt)
            if not fmt.startswith('image/'):
                continue
            pixmap = QPixmap()
            pixmap.loadFromData(bytes(md.data(fmt)))
            if not pixmap.isNull():
                return pixmap, None
            # Only the first image/* format is tried
            break
    if md.hasFormat('application/octet-stream'):
        pixmap = QPixmap()
        pixmap.loadFromData(bytes(md.data('application/octet-stream')))
        if not pixmap.isNull():
            return pixmap, None

    if image_exts is None:
        image_exts = image_extensions()

    # No image, look for an URL pointing to an image
    urls = urls_from_md(md)
    # First look for a local file with an image extension that exists
    local_images = [
        candidate for candidate in (path_from_qurl(u) for u in urls)
        if posixpath.splitext(unquote(candidate))[1][1:].lower() in image_exts
    ]
    local_images = [candidate for candidate in local_images if os.path.exists(candidate)]
    loaded = QPixmap()
    for path in local_images:
        try:
            with open(path, 'rb') as f:
                loaded.loadFromData(f.read())
        except Exception:
            continue
        if not loaded.isNull():
            return loaded, None

    # No local images, look for remote ones

    # First, see if this is from Firefox
    rurl, fname = get_firefox_rurl(md, image_exts)
    if rurl and fname:
        return rurl, fname

    # Look through all remaining URLs, the first hit wins
    for remote_url, filename in remote_urls_from_qurl(urls, image_exts):
        return remote_url, filename

    return None, None
def download_one(tdir, timeout, progress_report, data_uri_map, url):
    '''
    Fetch the resource at ``url`` into a new temporary file inside ``tdir``.

    ``file:`` URLs are read from disk, ``data:`` URLs are decoded in memory
    and everything else is opened with ``urlopen``.

    :param tdir: directory in which the temporary file is created.
    :param timeout: timeout passed to ``urlopen`` for remote URLs.
    :param progress_report: callable invoked as
        ``progress_report(url, 0, size)`` before copying; it is also handed
        to ``ProgressTracker``.
    :param data_uri_map: mapping from decoded ``data:`` payload to the name
        of the file already holding it; consulted and updated here so an
        identical payload is stored only once.
    :param url: the URL to download.
    :return: ``(True, (url, filename, path, mime_type))`` on success, or
        ``(False, (url, error_text))`` on any failure. Never raises.
    '''
    try:
        purl = urlparse(url)
        data_url_key = None
        with NamedTemporaryFile(dir=tdir, delete=False) as df:
            if purl.scheme == 'file':
                path = unquote(purl.path)
                if iswindows and path.startswith('/'):
                    path = path[1:]
                src = lopen(path, 'rb')
                filename = os.path.basename(path)
                # Seek to the end to learn the size, then rewind
                sz = (src.seek(0, os.SEEK_END), src.tell(), src.seek(0))[1]
            elif purl.scheme == 'data':
                prefix, payload = purl.path.split(',', 1)
                parts = prefix.split(';')
                if parts and parts[-1].lower() == 'base64':
                    payload = re.sub(r'\s+', '', payload)
                    payload = from_base64_bytes(payload)
                else:
                    payload = payload.encode('utf-8')
                # Name the payload from its declared media type. This must
                # happen before the seen-before lookup below, which returns
                # the file name.
                ext = 'unknown'
                for x in parts:
                    if '=' not in x and '/' in x:
                        exts = mimetypes.guess_all_extensions(x)
                        if exts:
                            ext = exts[0]
                            break
                filename = 'data-uri.' + ext
                seen_before = data_uri_map.get(payload)
                if seen_before is not None:
                    # NOTE(review): the type is guessed from the stored
                    # file's name rather than from ``filename`` - confirm
                    # that this is what callers expect.
                    return True, (url, filename, seen_before, guess_type(seen_before))
                data_url_key = payload
                src = BytesIO(payload)
                sz = len(payload)
            else:
                src = urlopen(url, timeout=timeout)
                filename = get_filename(purl, src)
                sz = get_content_length(src)
            progress_report(url, 0, sz)
            dest = ProgressTracker(df, url, sz, progress_report)
            with closing(src):
                shutil.copyfileobj(src, dest)
            if data_url_key is not None:
                data_uri_map[data_url_key] = dest.name
            filename = sanitize_file_name(filename)
            mt = guess_type(filename)
            if mt in OEB_DOCS:
                raise ValueError(
                    'The external resource {} looks like a HTML document ({})'.format(url, filename))
            if not mt or mt == 'application/octet-stream' or '.' not in filename:
                raise ValueError(
                    'The external resource {} is not of a known type'.format(url))
            return True, (url, filename, dest.name, mt)
    except Exception as err:
        return False, (url, as_unicode(err))
def _cover_from_html(self, hcover):
    '''
    Render the HTML cover ``hcover`` and add the result to the manifest.

    The book is first written to a temporary directory so the cover page
    can be rendered from disk. The rendered data (empty bytes when
    rendering produced nothing) is added as a new ``cover.jpg`` item.

    :return: the newly added manifest item.
    '''
    from calibre.ebooks import render_html_svg_workaround
    with TemporaryDirectory('_html_cover') as tdir:
        OEBWriter()(self.oeb, tdir)
        cover_path = os.path.join(tdir, unquote(hcover.href))
        data = render_html_svg_workaround(cover_path, self.logger) or b''
    item_id, item_href = self.oeb.manifest.generate('cover', 'cover.jpg')
    return self.oeb.manifest.add(item_id, item_href, JPEG_MIME, data=data)
def add_links(self):
    '''
    Create a link annotation for every entry in ``self.links``.

    A link whose href is empty, or whose resolved path is a key of
    ``self.anchors``, becomes an internal destination. Any other link with
    a scheme other than ``file`` becomes a URI action. Links that resolve
    to neither are reported through ``self.pdf.debug`` and skipped.
    '''
    for link in self.links:
        (path, href, frag), page, rect = link
        combined_path = os.path.normcase(os.path.abspath(
            os.path.join(os.path.dirname(path), *unquote(href).split('/'))))
        is_local = not href or combined_path in self.anchors
        annot = Dictionary({
            'Type': Name('Annot'),
            'Subtype': Name('Link'),
            'Rect': rect,
            'Border': Array([0, 0, 0]),
        })
        if self.mark_links:
            annot.update({'Border': Array([16, 16, 1]), 'C': Array([1.0, 0, 0])})

        if is_local:
            if href:
                path = combined_path
            # Prefer the exact anchor, fall back to the file's None entry
            try:
                annot['Dest'] = self.anchors[path][frag]
            except KeyError:
                try:
                    annot['Dest'] = self.anchors[path][None]
                except KeyError:
                    pass
        else:
            url = href + (('#' + frag) if frag else '')
            try:
                purl = urlparse(url)
            except Exception:
                self.pdf.debug('Ignoring unparseable URL: %r' % url)
                continue
            if purl.scheme and purl.scheme != 'file':
                action = Dictionary({'Type': Name('Action'), 'S': Name('URI')})
                # Do not try to normalize/quote/unquote this URL as if it
                # has a query part, it will get corrupted
                action['URI'] = String(url)
                annot['A'] = action

        if 'A' not in annot and 'Dest' not in annot:
            self.pdf.debug('Could not find destination for link: %s in file %s' % (href, path))
            continue
        if 'Annots' not in page:
            page['Annots'] = Array()
        page['Annots'].append(self.pdf.objects.add(annot))
def _cover_from_html(self, hcover):
    '''
    Render the HTML cover ``hcover`` and add the result to the manifest.

    The book is first written to a temporary directory so the cover page
    can be rendered from disk. The rendered data is added as a new
    ``cover.jpg`` item.

    :return: the newly added manifest item.
    '''
    from calibre.ebooks import render_html_svg_workaround
    with TemporaryDirectory('_html_cover') as tdir:
        writer = OEBWriter()
        writer(self.oeb, tdir)
        path = os.path.join(tdir, unquote(hcover.href))
        data = render_html_svg_workaround(path, self.logger)
        if not data:
            # The item is binary image data, so the empty fallback must be
            # bytes rather than text
            data = b''
    id, href = self.oeb.manifest.generate('cover', 'cover.jpg')
    item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data)
    return item
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    '''
    Write ``oeb_book`` into the directory ``output_path``.

    The OPF, NCX and page-map documents are serialized first, followed by
    every manifest item at the location named by its unquoted href.
    Stylesheets are condensed beforehand unless ``opts.expand_css`` is set
    or the output profile's short name contains ``nook``.
    '''
    from polyglot.urllib import unquote
    from lxml import etree

    self.log, self.opts = log, opts
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME, OEB_STYLES
    from calibre.ebooks.oeb.normalize_css import condense_sheet
    with CurrentDir(output_path):
        results = oeb_book.to_opf2(page_map=True)
        for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
            href, root = results.pop(key, [None, None])
            if root is None:
                continue
            is_opf = key == OPF_MIME
            if is_opf:
                # The cover workarounds are best effort: failures are
                # logged and the conversion carries on
                try:
                    self.workaround_nook_cover_bug(root)
                except:
                    self.log.exception('Something went wrong while trying to'
                            ' workaround Nook cover bug, ignoring')
                try:
                    self.workaround_pocketbook_cover_bug(root)
                except:
                    self.log.exception('Something went wrong while trying to'
                            ' workaround Pocketbook cover bug, ignoring')
                self.migrate_lang_code(root)
                self.adjust_mime_types(root)
            raw = etree.tostring(root, pretty_print=True,
                    encoding='utf-8', xml_declaration=True)
            if is_opf:
                # Needed as I can't get lxml to output opf:role and
                # not output <opf:metadata> as well
                raw = re.sub(br'(<[/]{0,1})opf:', br'\1', raw)
            with lopen(href, 'wb') as f:
                f.write(raw)

        for item in oeb_book.manifest:
            if (not self.opts.expand_css and item.media_type in OEB_STYLES
                    and hasattr(item.data, 'cssText')
                    and 'nook' not in self.opts.output_profile.short_name):
                condense_sheet(item.data)
            target = os.path.abspath(unquote(item.href))
            parent_dir = os.path.dirname(target)
            if not os.path.exists(parent_dir):
                os.makedirs(parent_dir)
            with lopen(target, 'wb') as f:
                f.write(item.bytes_representation)
            item.unload_data_from_memory(memory=target)
def get_toc(self):
    '''
    Read the table of contents from ``self.stream``.

    The 32-bit value at byte 24 gives the offset of the table. The table
    starts with an entry count, and each entry is a 32-byte NUL-padded,
    percent-quoted name followed by three 32-bit values: size, offset and
    flags.

    :return: an ``RBToc`` holding one ``RBToc.Item`` per entry.
    '''
    self.stream.seek(24)
    self.stream.seek(self.read_i32())
    entry_count = self.read_i32()
    toc = RBToc()
    for entry_index in range(entry_count):
        name = unquote(self.stream.read(32).strip(b'\x00'))
        size = self.read_i32()
        offset = self.read_i32()
        flags = self.read_i32()
        toc.append(RBToc.Item(name=name, size=size, offset=offset, flags=flags))
    return toc
def get_toc(self):
    '''
    Read the table of contents from ``self.stream``.

    The 32-bit value at byte 24 gives the offset of the table. The table
    starts with an entry count, and each entry is a 32-byte NUL-padded,
    percent-quoted name followed by three 32-bit values: size, offset and
    flags.

    :return: an ``RBToc`` holding one ``RBToc.Item`` per entry.
    '''
    self.stream.seek(24)
    self.stream.seek(self.read_i32())
    entry_count = self.read_i32()
    toc = RBToc()
    for entry_index in range(entry_count):
        name = unquote(self.stream.read(32).strip(b'\x00'))
        size = self.read_i32()
        offset = self.read_i32()
        flags = self.read_i32()
        toc.append(RBToc.Item(name=name, size=size, offset=offset, flags=flags))
    return toc
def read_from_opf(self, opfreader):
    '''
    Locate the table of contents referenced by an OPF and read it.

    The reference is taken from the spine's ``toc`` attribute, else from
    the guide's ``toc`` reference, else from the first manifest item whose
    href contains ``toc``. The ids ``ncx`` and ``ncxtoc`` are read as NCX
    files; anything else is treated as the path of an HTML ToC.
    Read failures are printed as warnings rather than raised, except for
    the last-resort ``*.ncx`` glob, whose errors propagate.
    '''
    toc = opfreader.soup.find('spine', toc=True)
    if toc is not None:
        toc = toc['toc']
    if toc is None:
        try:
            toc = opfreader.soup.find('guide').find('reference', attrs={'type': 'toc'})['href']
        except:
            for item in opfreader.manifest:
                if 'toc' in item.href().lower():
                    toc = item.href()
                    break

    if toc is None:
        return

    if toc.lower() in ('ncx', 'ncxtoc'):
        ncx = opfreader.manifest.item(toc.lower())
        ncx = getattr(ncx, 'path', ncx)
        if ncx and os.access(ncx, os.R_OK):
            try:
                self.read_ncx_toc(ncx)
            except Exception as err:
                print('WARNING: Invalid NCX file:', err)
            return
        # No readable manifest entry: fall back to any .ncx in the base dir
        base = os.path.abspath(self.base_path)
        found = glob.glob(os.path.join(base, '*.ncx'))
        if found:
            self.read_ncx_toc(found[0])
        return

    toc = urlparse(unquote(toc))[2]
    toc = toc.replace('/', os.sep)
    if not os.path.isabs(toc):
        toc = os.path.join(self.base_path, toc)
    try:
        if not os.path.exists(toc):
            # Bug in BAEN OPF files
            fixed_name = os.path.basename(toc).replace('_top.htm', '_toc.htm')
            toc = os.path.join(os.path.dirname(toc), fixed_name)
        self.read_html_toc(toc)
    except:
        print('WARNING: Could not read Table of Contents. Continuing anyway.')
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    '''
    Write ``oeb_book`` into the directory ``output_path``.

    The OPF, NCX and page-map documents are serialized first, followed by
    every manifest item at the location named by its unquoted href.
    Stylesheets are condensed beforehand unless ``opts.expand_css`` is set
    or the output profile's short name contains ``nook``.
    '''
    from polyglot.urllib import unquote
    from lxml import etree

    self.log, self.opts = log, opts
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME, OEB_STYLES
    from calibre.ebooks.oeb.normalize_css import condense_sheet
    with CurrentDir(output_path):
        results = oeb_book.to_opf2(page_map=True)
        for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
            href, root = results.pop(key, [None, None])
            if root is None:
                continue
            is_opf = key == OPF_MIME
            if is_opf:
                # The cover workarounds are best effort: failures are
                # logged and the conversion carries on
                try:
                    self.workaround_nook_cover_bug(root)
                except:
                    self.log.exception('Something went wrong while trying to'
                            ' workaround Nook cover bug, ignoring')
                try:
                    self.workaround_pocketbook_cover_bug(root)
                except:
                    self.log.exception('Something went wrong while trying to'
                            ' workaround Pocketbook cover bug, ignoring')
                self.migrate_lang_code(root)
            raw = etree.tostring(root, pretty_print=True,
                    encoding='utf-8', xml_declaration=True)
            if is_opf:
                # Needed as I can't get lxml to output opf:role and
                # not output <opf:metadata> as well
                raw = re.sub(br'(<[/]{0,1})opf:', br'\1', raw)
            with open(href, 'wb') as f:
                f.write(raw)

        for item in oeb_book.manifest:
            if (not self.opts.expand_css and item.media_type in OEB_STYLES
                    and hasattr(item.data, 'cssText')
                    and 'nook' not in self.opts.output_profile.short_name):
                condense_sheet(item.data)
            target = os.path.abspath(unquote(item.href))
            parent_dir = os.path.dirname(target)
            if not os.path.exists(parent_dir):
                os.makedirs(parent_dir)
            with open(target, 'wb') as f:
                f.write(item.bytes_representation)
            item.unload_data_from_memory(memory=target)
def insert_cover(self):
    '''
    Make sure the book starts with a title page showing the cover.

    When the guide has no ``titlepage`` entry, one is generated from the
    guide's cover (or ``self.default_cover()``) using the SVG or non-SVG
    template; nothing happens if no cover is available. The title page is
    then inserted at the start of the spine and the ``cover`` and
    ``titlepage`` guide references are pointed at it.
    '''
    from calibre.ebooks.oeb.base import urldefrag
    guide, manifest = self.oeb.guide, self.oeb.manifest
    item = None
    if 'titlepage' in guide:
        item = self.oeb.manifest.hrefs[
            urldefrag(self.oeb.guide['titlepage'].href)[0]]
    else:
        href = guide['cover'].href if 'cover' in guide else self.default_cover()
        if href is None:
            return
        width, height = self.inspect_cover(href)
        if width == -1 or height == -1:
            self.log.warning('Failed to read cover dimensions')
            width, height = 600, 800
        substitutions = (
            ('__viewbox__', '0 0 %d %d' % (width, height)),
            ('__width__', unicode_type(width)),
            ('__height__', unicode_type(height)),
        )
        for placeholder, value in substitutions:
            self.svg_template = self.svg_template.replace(placeholder, value)

        if href is not None:
            if self.no_svg_cover:
                template = self.non_svg_template
            else:
                template = self.svg_template
            markup = template % unquote(href)
            item_id, href = manifest.generate('titlepage', 'titlepage.xhtml')
            item = manifest.add(item_id, href, guess_type('t.xhtml')[0],
                                data=etree.fromstring(markup))

    if item is None:
        return
    self.oeb.spine.insert(0, item, True)
    if 'cover' not in self.oeb.guide.refs:
        self.oeb.guide.add('cover', 'Title Page', 'a')
    self.oeb.guide.refs['cover'].href = item.href
    if 'titlepage' in self.oeb.guide.refs:
        self.oeb.guide.refs['titlepage'].href = item.href
    toc_entry = getattr(self.oeb.toc, 'item_that_refers_to_cover', None)
    if toc_entry is not None:
        toc_entry.href = item.href
def parse_html_toc(data):
    '''
    Yield ``(href, fragment, text)`` for every link in an HTML document.

    :param data: the HTML, as text or as bytes (bytes are decoded with
        ``xml_to_unicode`` first).

    ``href`` is the stripped path part of the unquoted link target,
    ``fragment`` is its stripped fragment or ``None`` when there is none,
    and ``text`` is the text content of the ``a`` element.
    '''
    from html5_parser import parse
    from calibre.utils.cleantext import clean_xml_chars
    from lxml import etree
    if isinstance(data, bytes):
        data = xml_to_unicode(data, strip_encoding_pats=True, resolve_entities=True)[0]
    root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False,
                 sanitize_names=True)
    for anchor in root.xpath('//*[@href and local-name()="a"]'):
        target = urlparse(unquote(anchor.get('href')))
        fragment = target[5].strip() if target[5] else None
        href = target[2].strip()
        label = etree.tostring(anchor, method='text', encoding='unicode')
        yield href, fragment, label
def insert_cover(self):
    '''
    Make sure the book starts with a title page showing the cover.

    When the guide has no ``titlepage`` entry, one is generated from the
    guide's cover (or ``self.default_cover()``) using the SVG or non-SVG
    template; nothing happens if no cover is available. The title page is
    then inserted at the start of the spine and the ``cover`` and
    ``titlepage`` guide references are pointed at it.
    '''
    from calibre.ebooks.oeb.base import urldefrag
    guide, manifest = self.oeb.guide, self.oeb.manifest
    item = None
    if 'titlepage' in guide:
        item = self.oeb.manifest.hrefs[
            urldefrag(self.oeb.guide['titlepage'].href)[0]]
    else:
        href = guide['cover'].href if 'cover' in guide else self.default_cover()
        if href is None:
            return
        width, height = self.inspect_cover(href)
        if width == -1 or height == -1:
            self.log.warning('Failed to read cover dimensions')
            width, height = 600, 800
        substitutions = (
            ('__viewbox__', '0 0 %d %d'%(width, height)),
            ('__width__', str(width)),
            ('__height__', str(height)),
        )
        for placeholder, value in substitutions:
            self.svg_template = self.svg_template.replace(placeholder, value)

        if href is not None:
            if self.no_svg_cover:
                template = self.non_svg_template
            else:
                template = self.svg_template
            markup = template % unquote(href)
            item_id, href = manifest.generate('titlepage', u'titlepage.xhtml')
            item = manifest.add(item_id, href, guess_type('t.xhtml')[0],
                                data=etree.fromstring(markup))

    if item is None:
        return
    self.oeb.spine.insert(0, item, True)
    if 'cover' not in self.oeb.guide.refs:
        self.oeb.guide.add('cover', 'Title Page', 'a')
    self.oeb.guide.refs['cover'].href = item.href
    if 'titlepage' in self.oeb.guide.refs:
        self.oeb.guide.refs['titlepage'].href = item.href
    toc_entry = getattr(self.oeb.toc, 'item_that_refers_to_cover', None)
    if toc_entry is not None:
        toc_entry.href = item.href
def build_node(current_node, parent=None):
    '''
    Render ``current_node``'s children as a nested ``ul`` / ``li`` tree.

    :param current_node: node whose ``nodes`` are rendered recursively.
    :param parent: element to attach to; a new root ``ul`` is created when
        it is ``None``.
    :return: the ``ul`` that received the entries, or ``parent`` itself
        when ``current_node`` has no children.
    '''
    if parent is None:
        parent = etree.Element('ul')
    elif len(current_node.nodes):
        parent = element(parent, 'ul')
    for child in current_node.nodes:
        entry = element(parent, 'li')
        # Link targets are written relative to the directory of ref_url
        target = relpath(abspath(unquote(child.href)), dirname(ref_url))
        if isinstance(target, bytes):
            target = target.decode('utf-8')
        anchor = element(entry, 'a', href=clean_xml_chars(target))
        label = child.title
        if isinstance(label, bytes):
            label = label.decode('utf-8')
        if label:
            label = re.sub(r'\s+', ' ', label)
        anchor.text = clean_xml_chars(label)
        build_node(child, entry)
    return parent
def build_node(current_node, parent=None):
    '''
    Render ``current_node``'s children as a nested ``ul`` / ``li`` tree.

    :param current_node: node whose ``nodes`` are rendered recursively.
    :param parent: element to attach to; a new root ``ul`` is created when
        it is ``None``.
    :return: the ``ul`` that received the entries, or ``parent`` itself
        when ``current_node`` has no children.
    '''
    if parent is None:
        parent = etree.Element('ul')
    elif len(current_node.nodes):
        parent = element(parent, 'ul')
    for child in current_node.nodes:
        entry = element(parent, 'li')
        # Link targets are written relative to the directory of ref_url
        target = relpath(abspath(unquote(child.href)), dirname(ref_url))
        if isinstance(target, bytes):
            target = target.decode('utf-8')
        anchor = element(entry, 'a', href=clean_xml_chars(target))
        label = child.title
        if isinstance(label, bytes):
            label = label.decode('utf-8')
        if label:
            label = re.sub(r'\s+', ' ', label)
        anchor.text = clean_xml_chars(label)
        build_node(child, entry)
    return parent
def read_from_opf(self, opfreader):
    '''
    Locate the table of contents referenced by an OPF and read it.

    The reference is taken from the spine's ``toc`` attribute, else from
    the guide's ``toc`` reference, else from the first manifest item whose
    href contains ``toc``. The ids ``ncx`` and ``ncxtoc`` are read as NCX
    files; anything else is treated as the path of an HTML ToC.
    Read failures are printed as warnings rather than raised, except for
    the last-resort ``*.ncx`` glob, whose errors propagate.
    '''
    toc = opfreader.soup.find('spine', toc=True)
    if toc is not None:
        toc = toc['toc']
    if toc is None:
        try:
            toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
        except:
            for item in opfreader.manifest:
                if 'toc' in item.href().lower():
                    toc = item.href()
                    break

    if toc is None:
        return

    if toc.lower() in ('ncx', 'ncxtoc'):
        ncx = opfreader.manifest.item(toc.lower())
        ncx = getattr(ncx, 'path', ncx)
        if ncx and os.access(ncx, os.R_OK):
            try:
                self.read_ncx_toc(ncx)
            except Exception as err:
                print('WARNING: Invalid NCX file:', err)
            return
        # No readable manifest entry: fall back to any .ncx in the base dir
        base = os.path.abspath(self.base_path)
        found = glob.glob(os.path.join(base, '*.ncx'))
        if found:
            self.read_ncx_toc(found[0])
        return

    toc = urlparse(unquote(toc))[2]
    toc = toc.replace('/', os.sep)
    if not os.path.isabs(toc):
        toc = os.path.join(self.base_path, toc)
    try:
        if not os.path.exists(toc):
            # Bug in BAEN OPF files
            fixed_name = os.path.basename(toc).replace('_top.htm', '_toc.htm')
            toc = os.path.join(os.path.dirname(toc), fixed_name)
        self.read_html_toc(toc)
    except:
        print('WARNING: Could not read Table of Contents. Continuing anyway.')
def _build_manifest(self):
    '''
    Serialize the manifest and store it as the ``/manifest`` file.

    Items are grouped into linear, nonlinear, css and images, in that
    order; each group is sorted by ``sort_key`` and written as a count
    followed by one record per item. Only spine items (linear/nonlinear)
    get a running offset, every other item gets offset 0.
    '''
    states = ['linear', 'nonlinear', 'css', 'images']
    grouped = {state: [] for state in states}
    for item in self._oeb.manifest.values():
        if item.spine_position is not None:
            grouped['linear' if item.linear else 'nonlinear'].append(item)
        elif item.media_type in OEB_STYLES:
            grouped['css'].append(item)
        elif item.media_type in LIT_IMAGES:
            grouped['images'].append(item)

    out = io.BytesIO()
    out.write(pack('<Bc', 1, b'\\'))
    offset = 0
    for state in states:
        in_spine = state in ('linear', 'nonlinear')
        members = sorted(grouped[state], key=attrgetter('sort_key'))
        out.write(pack('<I', len(members)))
        for item in members:
            item_id, media_type = item.id, item.media_type
            if media_type in OEB_DOCS:
                # Needs to have 'html' in media-type
                media_type = XHTML_MIME
            elif media_type in OEB_STYLES:
                media_type = CSS_MIME
            href = unquote(item.href)
            item.offset = offset if in_spine else 0
            out.write(pack('<I', item.offset))
            # Each field is written as a length character followed by the
            # field itself, both encoded as UTF-8
            record = []
            for field in (item_id, href, media_type):
                record.append(codepoint_to_chr(len(field)))
                record.append(unicode_type(field))
            for value in record:
                out.write(value.encode('utf-8'))
            out.write(b'\0')
            offset += item.size
    self._add_file('/manifest', out.getvalue())
def dnd_get_local_image_and_pixmap(md, image_exts=None):
    '''
    Get a locally available image from the QMimeData object md.

    :param image_exts: extensions treated as images; defaults to the result
        of ``image_extensions()``.
    :return: pixmap, raw image data if an image is found (the pixmap is
             not null)
             None, None otherwise
    '''
    if md.hasImage():
        for fmt in md.formats():
            fmt = unicode_type(fmt)
            if not fmt.startswith('image/'):
                continue
            raw = bytes(md.data(fmt))
            pixmap = QPixmap()
            pixmap.loadFromData(raw)
            if not pixmap.isNull():
                return pixmap, raw
    if md.hasFormat('application/octet-stream'):
        raw = bytes(md.data('application/octet-stream'))
        pixmap = QPixmap()
        pixmap.loadFromData(raw)
        if not pixmap.isNull():
            return pixmap, raw

    if image_exts is None:
        image_exts = image_extensions()

    # No image, look for an URL pointing to an image
    urls = urls_from_md(md)
    # Look for a local file with an image extension that exists
    local_images = [
        candidate for candidate in (path_from_qurl(u) for u in urls)
        if posixpath.splitext(unquote(candidate))[1][1:].lower() in image_exts
    ]
    local_images = [candidate for candidate in local_images if os.path.exists(candidate)]
    for path in local_images:
        try:
            with open(path, 'rb') as f:
                raw = f.read()
        except Exception:
            continue
        pixmap = QPixmap()
        pixmap.loadFromData(raw)
        if not pixmap.isNull():
            return pixmap, raw

    return None, None
def rewrite_links(self, url):
    '''
    Return the replacement for ``url`` according to ``self.map``.

    ``url`` is returned unchanged when it cannot be parsed or normalized,
    or when its normalized absolute href is not a key of ``self.map``.
    Otherwise the mapped target is made relative to the current item and,
    for links that carry a fragment, unquoted and re-joined with it.
    '''
    href, frag = urldefrag(url)
    try:
        # ValueError from abshref: unparsable URL
        # ValueError from urlnormalize: href has non utf-8 quoting
        href = urlnormalize(self.current_item.abshref(href))
    except ValueError:
        return url
    if href not in self.map:
        return url
    target = self.map[href][frag if frag else None]
    target = self.current_item.relhref(target)
    if frag:
        target = '#'.join((unquote(target), frag))
    return target
def dnd_get_files(md, exts, allow_all_extensions=False, filter_exts=()):
    '''
    Get the file in the QMimeData object md with an extension that is one of
    the extensions in exts.

    :param allow_all_extensions: accept any non-empty extension.
    :param filter_exts: extensions that are always rejected.
    :return: None, None if no file is found
             [paths], None if a local file is found
             [urls], [filenames] if URLs that point to a files are found
    '''
    def is_ok(path):
        ext = posixpath.splitext(path)[1][1:].lower()
        if ext in filter_exts:
            return False
        if allow_all_extensions and ext:
            return True
        return ext in exts

    # Look for a URL pointing to a file
    urls = urls_from_md(md)
    # First look for a local file with an acceptable extension that exists
    candidates = [
        path for path in (path_from_qurl(u) for u in urls)
        if is_ok(unquote(path))
    ]
    local_files = [path for path in candidates if os.path.exists(path)]
    if local_files:
        return local_files, None

    # No local files, look for remote ones

    # First, see if this is from Firefox
    rurl, fname = get_firefox_rurl(md, exts)
    if rurl and fname:
        return [rurl], [fname]

    # Look through all remaining URLs
    rurls, filenames = [], []
    for rurl, fname in remote_urls_from_qurl(urls, exts):
        rurls.append(rurl)
        filenames.append(fname)
    if rurls:
        return rurls, filenames

    return None, None
def dnd_get_files(md, exts, allow_all_extensions=False, filter_exts=()):
    '''
    Get the file in the QMimeData object md with an extension that is one of
    the extensions in exts.

    :param allow_all_extensions: accept any non-empty extension.
    :param filter_exts: extensions that are always rejected.
    :return: None, None if no file is found
             [paths], None if a local file is found
             [urls], [filenames] if URLs that point to a files are found
    '''
    def is_ok(path):
        ext = posixpath.splitext(path)[1][1:].lower()
        if ext in filter_exts:
            return False
        if allow_all_extensions and ext:
            return True
        return ext in exts

    # Look for a URL pointing to a file
    urls = urls_from_md(md)
    # First look for a local file with an acceptable extension that exists
    candidates = [
        path for path in (path_from_qurl(u) for u in urls)
        if is_ok(unquote(path))
    ]
    local_files = [path for path in candidates if os.path.exists(path)]
    if local_files:
        return local_files, None

    # No local files, look for remote ones

    # First, see if this is from Firefox
    rurl, fname = get_firefox_rurl(md, exts)
    if rurl and fname:
        return [rurl], [fname]

    # Look through all remaining URLs
    rurls, filenames = [], []
    for rurl, fname in remote_urls_from_qurl(urls, exts):
        rurls.append(rurl)
        filenames.append(fname)
    if rurls:
        return rurls, filenames

    return None, None
def read_html_toc(self, toc):
    '''
    Add an item for every link found in the HTML file ``toc``.

    :param toc: path of the HTML file; its directory becomes
        ``self.base_path``.

    Links without an ``href`` are skipped, as are links whose
    (href, fragment) pair is already present among ``self.flat()``.
    '''
    self.base_path = os.path.dirname(toc)
    # Read through a context manager so the file handle is closed promptly
    # rather than whenever the garbage collector gets to it
    with open(toc, 'rb') as f:
        raw = f.read()
    soup = BeautifulSoup(raw, convertEntities=BeautifulSoup.HTML_ENTITIES)
    for a in soup.findAll('a'):
        if not a.has_key('href'):  # noqa
            continue
        purl = urlparse(unquote(a['href']))
        href, fragment = purl[2], purl[5]
        if not fragment:
            fragment = None
        else:
            fragment = fragment.strip()
        href = href.strip()

        txt = ''.join([unicode_type(s).strip() for s in a.findAll(text=True)])
        already_present = any(
            i.href == href and i.fragment == fragment for i in self.flat())
        if not already_present:
            self.add_item(href, fragment, txt)
def __init__(self, title=_('Choose Files'),
                   filters=[],
                   add_all_files_filter=True,
                   parent=None,
                   modal=True,
                   name='',
                   mode=QFileDialog.ExistingFiles,
                   default_dir=u'~',
                   no_save_dir=False,
                   combine_file_and_saved_dir=False
                   ):
    '''
    Show a file dialog and record what the user picked.

    The dialog is run from within this constructor; afterwards
    ``self.selected_files`` holds the chosen paths and ``self.accepted``
    is true when at least one was chosen.

    :param title: dialog window title.
    :param filters: iterable of ``(text, extensions)`` pairs used to build
        the name-filter string. It is only read here, never modified.
    :param add_all_files_filter: append an ``All files (*)`` filter (this is
        always done when no other filter text exists).
    :param parent: parent widget handed to the QFileDialog call.
    :param modal: not referenced in this method.
    :param name: combined with ``title`` by ``dialog_name`` to form the key
        under which the last used location is stored in ``dynamic``.
    :param mode: the QFileDialog mode, selects which dialog is shown.
    :param default_dir: starting directory when no saved one is used.
    :param no_save_dir: neither read nor store the last used location.
    :param combine_file_and_saved_dir: start in the saved directory, joined
        with the base name of ``default_dir``.
    '''
    from calibre.gui2 import dynamic, sanitize_env_vars
    QObject.__init__(self)
    # Build the ';;' separated name-filter string
    ftext = ''
    if filters:
        for filter in filters:
            text, extensions = filter
            extensions = ['*'+(i if i.startswith('.') else '.'+i) for i in
                    extensions]
            ftext += '%s (%s);;'%(text, ' '.join(extensions))
    if add_all_files_filter or not ftext:
        ftext += 'All files (*)'
    if ftext.endswith(';;'):
        ftext = ftext[:-2]

    self.dialog_name = dialog_name(name, title)
    self.selected_files = None
    self.fd = None

    # Work out the directory the dialog opens in
    if combine_file_and_saved_dir:
        bn = os.path.basename(default_dir)
        prev = dynamic.get(self.dialog_name,
                expanduser(u'~'))
        if os.path.exists(prev):
            if os.path.isfile(prev):
                prev = os.path.dirname(prev)
        else:
            prev = expanduser(u'~')
        initial_dir = os.path.join(prev, bn)
    elif no_save_dir:
        initial_dir = expanduser(default_dir)
    else:
        initial_dir = dynamic.get(self.dialog_name,
                expanduser(default_dir))
    if not isinstance(initial_dir, string_or_bytes):
        initial_dir = expanduser(default_dir)
    # A missing initial_dir is tolerated only for save dialogs that do not
    # use the stored location as is
    if not initial_dir or (not os.path.exists(initial_dir) and not (
            mode == QFileDialog.AnyFile and (no_save_dir or combine_file_and_saved_dir))):
        initial_dir = select_initial_dir(initial_dir)
    self.selected_files = []
    # Native dialogs are used unless this environment variable is defined
    use_native_dialog = 'CALIBRE_NO_NATIVE_FILEDIALOGS' not in os.environ
    with sanitize_env_vars():
        opts = QFileDialog.Option()
        if not use_native_dialog:
            opts |= QFileDialog.DontUseNativeDialog
        if mode == QFileDialog.AnyFile:
            # Save dialog: the chosen file does not have to exist
            f = QFileDialog.getSaveFileName(parent, title,
                initial_dir, ftext, "", opts)
            if f and f[0]:
                self.selected_files.append(f[0])
        elif mode == QFileDialog.ExistingFile:
            f = QFileDialog.getOpenFileName(parent, title,
                initial_dir, ftext, "", opts)
            if f and f[0] and os.path.exists(f[0]):
                self.selected_files.append(f[0])
        elif mode == QFileDialog.ExistingFiles:
            fs = QFileDialog.getOpenFileNames(parent, title, initial_dir,
                    ftext, "", opts)
            if fs and fs[0]:
                for f in fs[0]:
                    f = unicode_type(f)
                    if not f:
                        continue
                    if not os.path.exists(f):
                        # QFileDialog for some reason quotes spaces
                        # on linux if there is more than one space in a row
                        f = unquote(f)
                    if f and os.path.exists(f):
                        self.selected_files.append(f)
        else:
            # Any other mode is handled as a directory chooser
            if mode == QFileDialog.Directory:
                opts |= QFileDialog.ShowDirsOnly
            f = unicode_type(QFileDialog.getExistingDirectory(parent, title, initial_dir, opts))
            if os.path.exists(f):
                self.selected_files.append(f)
    if self.selected_files:
        self.selected_files = [unicode_type(q) for q in self.selected_files]
        # Remember the directory of the first selection for next time
        saved_loc = self.selected_files[0]
        if os.path.isfile(saved_loc):
            saved_loc = os.path.dirname(saved_loc)
        if not no_save_dir:
            dynamic[self.dialog_name] = saved_loc
    self.accepted = bool(self.selected_files)
def __init__(
    self,
    title=_('Choose Files'),
    filters=(),
    add_all_files_filter=True,
    parent=None,
    modal=True,
    name='',
    mode=QFileDialog.ExistingFiles,
    default_dir=u'~',
    no_save_dir=False,
    combine_file_and_saved_dir=False
):
    """Run a file/directory chooser from the constructor and record the result.

    Every ``QFileDialog`` call is wrapped in the main window's
    ``adapt_menu_bar_for_dialog`` context manager when a main window
    exists. After construction ``self.selected_files`` is a list of the
    chosen paths (empty when nothing usable was chosen) and
    ``self.accepted`` is the truthiness of that list.

    :param title: Dialog title. Together with ``name`` it is passed to
        ``dialog_name()`` to build the key under which the last used
        directory is stored in ``dynamic``.
    :param filters: Iterable of ``(description, extensions)`` pairs. Each
        extension may be given with or without its leading dot.
    :param add_all_files_filter: Append an ``All files (*)`` entry. The
        entry is always added when ``filters`` yields nothing.
    :param parent: Parent widget handed to the ``QFileDialog`` helpers.
    :param modal: Accepted for interface compatibility; not used here.
    :param name: See ``title``.
    :param mode: ``QFileDialog.AnyFile`` opens a save dialog,
        ``ExistingFile`` / ``ExistingFiles`` open one / several files, any
        other value opens a directory chooser.
    :param default_dir: Starting location used when no directory has been
        remembered (``~`` is expanded).
    :param no_save_dir: Do not store the chosen directory in ``dynamic``.
        Unless ``combine_file_and_saved_dir`` is set, the remembered
        directory is not read either.
    :param combine_file_and_saved_dir: Start at the remembered directory
        joined with the base name of ``default_dir``.
    """
    from calibre.gui2 import dynamic, sanitize_env_vars
    from calibre.gui2.ui import get_gui
    gui = get_gui()
    # NOTE(review): Dummy() is presumably a no-op context manager used when
    # there is no main window -- confirm against its definition.
    adapt_menubar = gui.bars_manager.adapt_menu_bar_for_dialog if gui is not None else Dummy()
    QObject.__init__(self)

    # Build the filter string, entries separated by ';;', e.g.
    # "Books (*.epub *.mobi);;All files (*)".
    filter_parts = []
    for text, extensions in (filters or ()):
        patterns = ['*' + (ext if ext.startswith('.') else '.' + ext)
                    for ext in extensions]
        filter_parts.append('%s (%s)' % (text, ' '.join(patterns)))
    if add_all_files_filter or not filter_parts:
        filter_parts.append('All files (*)')
    ftext = ';;'.join(filter_parts)

    self.dialog_name = dialog_name(name, title)
    self.selected_files = None
    self.fd = None

    if combine_file_and_saved_dir:
        bn = os.path.basename(default_dir)
        prev = dynamic.get(self.dialog_name, os.path.expanduser(u'~'))
        # The stored value is validated before it reaches os.path, just as
        # initial_dir is validated below: a non-string entry must fall back
        # to the home directory instead of raising.
        if not isinstance(prev, string_or_bytes) or not os.path.exists(prev):
            prev = os.path.expanduser(u'~')
        elif os.path.isfile(prev):
            prev = os.path.dirname(prev)
        initial_dir = os.path.join(prev, bn)
    elif no_save_dir:
        initial_dir = os.path.expanduser(default_dir)
    else:
        initial_dir = dynamic.get(
            self.dialog_name, os.path.expanduser(default_dir))
    if not isinstance(initial_dir, string_or_bytes):
        initial_dir = os.path.expanduser(default_dir)
    # A non-existent starting path is only tolerated for save dialogs whose
    # path was derived from default_dir (it may name the file to create).
    save_target_may_be_missing = (
        mode == QFileDialog.AnyFile
        and (no_save_dir or combine_file_and_saved_dir))
    if not initial_dir or (not os.path.exists(initial_dir)
                           and not save_target_may_be_missing):
        initial_dir = select_initial_dir(initial_dir)

    self.selected_files = []
    use_native_dialog = 'CALIBRE_NO_NATIVE_FILEDIALOGS' not in os.environ
    with sanitize_env_vars():
        opts = QFileDialog.Option()
        if not use_native_dialog:
            opts |= QFileDialog.DontUseNativeDialog
        if mode == QFileDialog.AnyFile:
            with adapt_menubar:
                f = QFileDialog.getSaveFileName(
                    parent, title, initial_dir, ftext, "", opts)
            if f and f[0]:
                self.selected_files.append(f[0])
        elif mode == QFileDialog.ExistingFile:
            with adapt_menubar:
                f = QFileDialog.getOpenFileName(
                    parent, title, initial_dir, ftext, "", opts)
            if f and f[0] and os.path.exists(f[0]):
                self.selected_files.append(f[0])
        elif mode == QFileDialog.ExistingFiles:
            with adapt_menubar:
                fs = QFileDialog.getOpenFileNames(
                    parent, title, initial_dir, ftext, "", opts)
            if fs and fs[0]:
                for f in fs[0]:
                    f = unicode_type(f)
                    if not f:
                        continue
                    if not os.path.exists(f):
                        # QFileDialog for some reason quotes spaces
                        # on linux if there is more than one space in a row
                        f = unquote(f)
                    if f and os.path.exists(f):
                        self.selected_files.append(f)
        else:
            if mode == QFileDialog.Directory:
                opts |= QFileDialog.ShowDirsOnly
            with adapt_menubar:
                f = unicode_type(QFileDialog.getExistingDirectory(
                    parent, title, initial_dir, opts))
            if os.path.exists(f):
                self.selected_files.append(f)

    if self.selected_files:
        self.selected_files = [unicode_type(q) for q in self.selected_files]
        # Remember the directory of the first selection for next time.
        saved_loc = self.selected_files[0]
        if os.path.isfile(saved_loc):
            saved_loc = os.path.dirname(saved_loc)
        if not no_save_dir:
            dynamic[self.dialog_name] = saved_loc
    self.accepted = bool(self.selected_files)
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    """Render ``oeb_book`` as templated HTML pages and pack them into a ZIP.

    An index page plus a ``<name>_files`` directory are produced in a
    temporary directory, added to the archive at ``output_path`` and the
    temporary directory is removed afterwards.

    :param oeb_book: Book to export; its ``manifest``, ``spine``, ``toc``
        and ``metadata`` are read.
    :param output_path: Path of the ZIP file to create. Its base name
        (with ``.zip`` removed) names the index page and files directory.
    :param input_plugin: Not used by this method.
    :param opts: Conversion options. ``template_html_index``,
        ``template_html`` and ``template_css`` may name replacement
        template files (``None`` selects the bundled ones). If
        ``extract_to`` is set, that directory is deleted, recreated and
        the archive is extracted into it.
    :param log: Logger; stored on ``self.log``.
    """
    from lxml import etree
    from calibre.utils import zipfile
    from templite import Templite
    from polyglot.urllib import unquote
    from calibre.ebooks.html.meta import EasyMeta

    def load_template(custom_path, builtin_resource):
        # Return the template as text, preferring the user supplied file.
        # The file is read inside a ``with`` so the handle is closed
        # deterministically.
        if custom_path is not None:
            with open(custom_path, 'rb') as src:
                raw = src.read()
        else:
            raw = P(builtin_resource, data=True)
        return raw.decode('utf-8')

    # read template files
    template_html_index_data = load_template(
        opts.template_html_index, 'templates/html_export_default_index.tmpl')
    template_html_data = load_template(
        opts.template_html, 'templates/html_export_default.tmpl')
    template_css_data = load_template(
        opts.template_css, 'templates/html_export_default.css')

    self.log = log
    self.opts = opts
    meta = EasyMeta(oeb_book.metadata)
    xhtml_ns = {'h': 'http://www.w3.org/1999/xhtml'}

    tempdir = os.path.realpath(PersistentTemporaryDirectory())
    output_file = os.path.join(
        tempdir, basename(re.sub(r'\.zip', '', output_path)+'.html'))
    output_dir = re.sub(r'\.html', '', output_file)+'_files'

    if not exists(output_dir):
        os.makedirs(output_dir)

    css_path = output_dir+os.sep+'calibreHtmlOutBasicCss.css'
    with open(css_path, 'wb') as f:
        f.write(template_css_data.encode('utf-8'))

    # Index page: table of contents plus a link to the first spine item.
    with open(output_file, 'wb') as f:
        html_toc = self.generate_html_toc(oeb_book, output_file, output_dir)
        templite = Templite(template_html_index_data)
        nextLink = oeb_book.spine[0].href
        nextLink = relpath(output_dir+os.sep+nextLink, dirname(output_file))
        cssLink = relpath(abspath(css_path), dirname(output_file))
        tocUrl = relpath(output_file, dirname(output_file))
        t = templite.render(has_toc=bool(oeb_book.toc.count()),
                            toc=html_toc, meta=meta, nextLink=nextLink,
                            tocUrl=tocUrl, cssLink=cssLink,
                            firstContentPageLink=nextLink)
        if isinstance(t, unicode_type):
            t = t.encode('utf-8')
        f.write(t)

    # Item hrefs are resolved relative to output_dir from here on.
    with CurrentDir(output_dir):
        for item in oeb_book.manifest:
            path = abspath(unquote(item.href))
            item_dir = dirname(path)
            if not exists(item_dir):
                os.makedirs(item_dir)
            if item.spine_position is not None:
                # Spine documents are written by the loop below; only
                # create an empty file for them here.
                with open(path, 'wb') as f:
                    pass
            else:
                with open(path, 'wb') as f:
                    f.write(item.bytes_representation)
                item.unload_data_from_memory(memory=path)

        for item in oeb_book.spine:
            path = abspath(unquote(item.href))
            item_dir = dirname(path)
            root = item.data.getroottree()

            # get & clean HTML <HEAD>-data
            head = root.xpath('//h:head', namespaces=xhtml_ns)[0]
            head_content = etree.tostring(
                head, pretty_print=True, encoding='unicode')
            head_content = re.sub(r'\<\/?head.*\>', '', head_content)
            head_content = re.sub(
                re.compile(r'\<style.*\/style\>', re.M|re.S), '', head_content)
            # Expand a self-closed <title/> into an explicit open/close pair.
            head_content = re.sub(
                r'<(title)([^>]*)/>', r'<\1\2></\1>', head_content)

            # get & clean HTML <BODY>-data
            body = root.xpath('//h:body', namespaces=xhtml_ns)[0]
            ebook_content = etree.tostring(
                body, pretty_print=True, encoding='unicode')
            ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)
            # Same expansion for self-closed <div/>, <a/> and <span/>.
            ebook_content = re.sub(
                r'<(div|a|span)([^>]*)/>', r'<\1\2></\1>', ebook_content)

            # generate link to next page
            if item.spine_position+1 < len(oeb_book.spine):
                nextLink = oeb_book.spine[item.spine_position+1].href
                nextLink = relpath(abspath(nextLink), item_dir)
            else:
                nextLink = None

            # generate link to previous page
            if item.spine_position > 0:
                prevLink = oeb_book.spine[item.spine_position-1].href
                prevLink = relpath(abspath(prevLink), item_dir)
            else:
                prevLink = None

            cssLink = relpath(abspath(css_path), item_dir)
            tocUrl = relpath(output_file, item_dir)
            firstContentPageLink = oeb_book.spine[0].href

            # render template
            templite = Templite(template_html_data)
            # The TOC is passed as a callable so the template decides
            # whether it is generated for this page.
            toc = lambda: self.generate_html_toc(oeb_book, path, output_dir)
            t = templite.render(ebookContent=ebook_content,
                                prevLink=prevLink, nextLink=nextLink,
                                has_toc=bool(oeb_book.toc.count()), toc=toc,
                                tocUrl=tocUrl, head_content=head_content,
                                meta=meta, cssLink=cssLink,
                                firstContentPageLink=firstContentPageLink)

            # write html to file
            with open(path, 'wb') as f:
                f.write(t.encode('utf-8'))
            item.unload_data_from_memory(memory=path)

    zfile = zipfile.ZipFile(output_path, "w")
    try:
        zfile.add_dir(output_dir, basename(output_dir))
        zfile.write(output_file, basename(output_file), zipfile.ZIP_DEFLATED)

        if opts.extract_to:
            if os.path.exists(opts.extract_to):
                shutil.rmtree(opts.extract_to)
            os.makedirs(opts.extract_to)
            zfile.extractall(opts.extract_to)
            self.log('Zip file extracted to', opts.extract_to)
    finally:
        # Close the archive even when adding or extracting fails.
        zfile.close()

    # cleanup temp dir
    shutil.rmtree(tempdir)
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    """Render ``oeb_book`` as templated HTML pages and pack them into a ZIP.

    An index page plus a ``<name>_files`` directory are produced in a
    temporary directory, added to the archive at ``output_path`` and the
    temporary directory is removed afterwards.

    :param oeb_book: Book to export; its ``manifest``, ``spine``, ``toc``
        and ``metadata`` are read.
    :param output_path: Path of the ZIP file to create. Its base name
        (with ``.zip`` removed) names the index page and files directory.
    :param input_plugin: Not used by this method.
    :param opts: Conversion options. ``template_html_index``,
        ``template_html`` and ``template_css`` may name replacement
        template files (``None`` selects the bundled ones). If
        ``extract_to`` is set, that directory is deleted, recreated and
        the archive is extracted into it.
    :param log: Logger; stored on ``self.log``.
    """
    from lxml import etree
    from calibre.utils import zipfile
    from templite import Templite
    from polyglot.urllib import unquote
    from calibre.ebooks.html.meta import EasyMeta

    def load_template(custom_path, builtin_resource):
        # Return the template as text, preferring the user supplied file.
        # The file is read inside a ``with`` so the handle is closed
        # deterministically.
        if custom_path is not None:
            with open(custom_path, 'rb') as src:
                raw = src.read()
        else:
            raw = P(builtin_resource, data=True)
        return raw.decode('utf-8')

    # read template files
    template_html_index_data = load_template(
        opts.template_html_index, 'templates/html_export_default_index.tmpl')
    template_html_data = load_template(
        opts.template_html, 'templates/html_export_default.tmpl')
    template_css_data = load_template(
        opts.template_css, 'templates/html_export_default.css')

    self.log = log
    self.opts = opts
    meta = EasyMeta(oeb_book.metadata)
    xhtml_ns = {'h': 'http://www.w3.org/1999/xhtml'}

    tempdir = os.path.realpath(PersistentTemporaryDirectory())
    output_file = os.path.join(
        tempdir, basename(re.sub(r'\.zip', '', output_path) + '.html'))
    output_dir = re.sub(r'\.html', '', output_file) + '_files'

    if not exists(output_dir):
        os.makedirs(output_dir)

    css_path = output_dir + os.sep + 'calibreHtmlOutBasicCss.css'
    with open(css_path, 'wb') as f:
        f.write(template_css_data.encode('utf-8'))

    # Index page: table of contents plus a link to the first spine item.
    with open(output_file, 'wb') as f:
        html_toc = self.generate_html_toc(oeb_book, output_file, output_dir)
        templite = Templite(template_html_index_data)
        nextLink = oeb_book.spine[0].href
        nextLink = relpath(output_dir + os.sep + nextLink,
                           dirname(output_file))
        cssLink = relpath(abspath(css_path), dirname(output_file))
        tocUrl = relpath(output_file, dirname(output_file))
        t = templite.render(has_toc=bool(oeb_book.toc.count()),
                            toc=html_toc, meta=meta, nextLink=nextLink,
                            tocUrl=tocUrl, cssLink=cssLink,
                            firstContentPageLink=nextLink)
        if isinstance(t, unicode_type):
            t = t.encode('utf-8')
        f.write(t)

    # Item hrefs are resolved relative to output_dir from here on.
    with CurrentDir(output_dir):
        for item in oeb_book.manifest:
            path = abspath(unquote(item.href))
            item_dir = dirname(path)
            if not exists(item_dir):
                os.makedirs(item_dir)
            if item.spine_position is not None:
                # Spine documents are written by the loop below; only
                # create an empty file for them here.
                with open(path, 'wb') as f:
                    pass
            else:
                with open(path, 'wb') as f:
                    # The file is opened in binary mode, so write the
                    # item's bytes rather than its str() form.
                    f.write(item.bytes_representation)
                item.unload_data_from_memory(memory=path)

        for item in oeb_book.spine:
            path = abspath(unquote(item.href))
            item_dir = dirname(path)
            root = item.data.getroottree()

            # get & clean HTML <HEAD>-data
            # encoding='unicode' makes lxml return text, which is what the
            # text regular expressions below and the template need.
            head = root.xpath('//h:head', namespaces=xhtml_ns)[0]
            head_content = etree.tostring(
                head, pretty_print=True, encoding='unicode')
            head_content = re.sub(r'\<\/?head.*\>', '', head_content)
            head_content = re.sub(
                re.compile(r'\<style.*\/style\>', re.M | re.S), '',
                head_content)
            # Expand a self-closed <title/> into an explicit open/close pair.
            head_content = re.sub(
                r'<(title)([^>]*)/>', r'<\1\2></\1>', head_content)

            # get & clean HTML <BODY>-data
            body = root.xpath('//h:body', namespaces=xhtml_ns)[0]
            ebook_content = etree.tostring(
                body, pretty_print=True, encoding='unicode')
            ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)
            # Same expansion for self-closed <div/>, <a/> and <span/>.
            ebook_content = re.sub(
                r'<(div|a|span)([^>]*)/>', r'<\1\2></\1>', ebook_content)

            # generate link to next page
            if item.spine_position + 1 < len(oeb_book.spine):
                nextLink = oeb_book.spine[item.spine_position + 1].href
                nextLink = relpath(abspath(nextLink), item_dir)
            else:
                nextLink = None

            # generate link to previous page
            if item.spine_position > 0:
                prevLink = oeb_book.spine[item.spine_position - 1].href
                prevLink = relpath(abspath(prevLink), item_dir)
            else:
                prevLink = None

            cssLink = relpath(abspath(css_path), item_dir)
            tocUrl = relpath(output_file, item_dir)
            firstContentPageLink = oeb_book.spine[0].href

            # render template
            templite = Templite(template_html_data)
            # The TOC is passed as a callable so the template decides
            # whether it is generated for this page.
            toc = lambda: self.generate_html_toc(oeb_book, path, output_dir)
            t = templite.render(ebookContent=ebook_content,
                                prevLink=prevLink, nextLink=nextLink,
                                has_toc=bool(oeb_book.toc.count()), toc=toc,
                                tocUrl=tocUrl, head_content=head_content,
                                meta=meta, cssLink=cssLink,
                                firstContentPageLink=firstContentPageLink)

            # write html to file (binary mode, so encode text first)
            if isinstance(t, unicode_type):
                t = t.encode('utf-8')
            with open(path, 'wb') as f:
                f.write(t)
            item.unload_data_from_memory(memory=path)

    zfile = zipfile.ZipFile(output_path, "w")
    try:
        zfile.add_dir(output_dir, basename(output_dir))
        zfile.write(output_file, basename(output_file), zipfile.ZIP_DEFLATED)

        if opts.extract_to:
            if os.path.exists(opts.extract_to):
                shutil.rmtree(opts.extract_to)
            os.makedirs(opts.extract_to)
            zfile.extractall(opts.extract_to)
            self.log('Zip file extracted to', opts.extract_to)
    finally:
        # Close the archive even when adding or extracting fails.
        zfile.close()

    # cleanup temp dir
    shutil.rmtree(tempdir)
def add_links(self):
    """Attach a link annotation to its page for each entry in ``self.links``.

    Each entry is ``((path, href, frag), page, rect)``. A link whose
    resolved target file is a key of ``self.anchors`` (or whose ``href``
    is empty) becomes an internal ``Dest``; any other link with a scheme
    other than ``file`` becomes a ``URI`` action. Links for which neither
    could be produced are reported through ``self.pdf.debug``.
    """
    for link in self.links:
        (path, href, frag), page, rect = link

        # Resolve href relative to the directory of the file it came from.
        segments = unquote(href).split('/')
        target = os.path.join(os.path.dirname(path), *segments)
        combined_path = os.path.normcase(os.path.abspath(target))
        is_local = not href or combined_path in self.anchors

        annot = Dictionary({
            'Type':Name('Annot'), 'Subtype':Name('Link'),
            'Rect':rect, 'Border':Array([0,0,0]),
        })
        if self.mark_links:
            annot.update({'Border':Array([16, 16, 1]), 'C':Array([1.0, 0, 0])})

        if is_local:
            if href:
                path = combined_path
            # Try the named fragment first, then the entry stored under
            # None for that file.
            for anchor_key in (frag, None):
                try:
                    annot['Dest'] = self.anchors[path][anchor_key]
                except KeyError:
                    continue
                break
        else:
            suffix = ('#'+frag) if frag else ''
            url = href + suffix
            try:
                purl = urlparse(url)
            except Exception:
                self.pdf.debug('Ignoring unparseable URL: %r' % url)
                continue
            scheme = purl.scheme
            if scheme and scheme != 'file':
                action = Dictionary({
                    'Type':Name('Action'), 'S':Name('URI'),
                })
                # The URL is stored verbatim: normalizing, quoting or
                # unquoting it would corrupt a query part.
                action['URI'] = String(url)
                annot['A'] = action

        if 'A' not in annot and 'Dest' not in annot:
            self.pdf.debug('Could not find destination for link: %s in file %s'%
                           (href, path))
            continue
        if 'Annots' not in page:
            page['Annots'] = Array()
        page['Annots'].append(self.pdf.objects.add(annot))