def parse_meta_tags(src):
    rmap = {}
    for field, names in iteritems(META_NAMES):
        for name in names:
            rmap[name.lower()] = field
    all_names = '|'.join(rmap)
    ans = {}
    npat = r'''name\s*=\s*['"]{0,1}(?P<name>%s)['"]{0,1}''' % all_names
    cpat = r'content\s*=\s*%s' % attr_pat
    for pat in (
            r'<meta\s+%s\s+%s' % (npat, cpat),
            r'<meta\s+%s\s+%s' % (cpat, npat),
    ):
        for match in re.finditer(pat, src, flags=re.IGNORECASE):
            x = match.group('name').lower()
            try:
                field = rmap[x]
            except KeyError:
                try:
                    field = rmap[x.replace(':', '.')]
                except KeyError:
                    continue
            if field not in ans:
                ans[field] = replace_entities(match.group('content'))
            if len(ans) == len(META_NAMES):
                return ans
    return ans
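# --- Usage sketch for parse_meta_tags (assumptions flagged) ---
# The function relies on three module-level names. The definitions below
# are illustrative stand-ins, not the real module's values: META_NAMES
# maps a metadata field to the <meta> name attributes that carry it,
# attr_pat must expose a 'content' group, and replace_entities expands
# HTML entities.
import re

META_NAMES = {'title': ('dc.title', 'title'), 'authors': ('dc.creator', 'author')}
attr_pat = r'''['"](?P<content>[^'"]+)['"]'''

def replace_entities(raw, encoding=None):  # stand-in for the real helper
    return raw

# parse_meta_tags('<meta name="dc.title" content="Emma">')
# -> {'title': 'Emma'}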
def all_links(html):
    ''' Return set of all links in the file '''
    ans = set()
    for match in re.finditer(
            r'''<\s*[Aa]\s+.*?[hH][Rr][Ee][Ff]\s*=\s*(['"])(.+?)\1''',
            html, re.MULTILINE | re.DOTALL):
        ans.add(replace_entities(match.group(2)))
    return ans
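# Quick illustration of all_links; with the real replace_entities the
# href value is unescaped on the way out:
#   all_links('<a href="a.html">x</a> <A HREF="b.html?x=1&amp;y=2">y</A>')
#   -> {'a.html', 'b.html?x=1&y=2'}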
def __init__(self, raw, codec, title):
    self.doctype = raw[:4]
    self.length, self.num_items = struct.unpack('>LL', raw[4:12])
    raw = raw[12:]
    pos = 0
    self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
    self.has_fake_cover = True
    self.start_offset = None
    left = self.num_items
    self.kf8_header = None
    self.uuid = self.cdetype = None

    while left > 0:
        left -= 1
        idx, size = struct.unpack('>LL', raw[pos:pos + 8])
        content = raw[pos + 8:pos + size]
        pos += size
        if 100 <= idx < 200:
            self.process_metadata(idx, content, codec)
        elif idx == 203:
            self.has_fake_cover = bool(struct.unpack('>L', content)[0])
        elif idx == 201:
            co, = struct.unpack('>L', content)
            if co < NULL_INDEX:
                self.cover_offset = co
        elif idx == 202:
            self.thumbnail_offset, = struct.unpack('>L', content)
        elif idx == 501:
            try:
                self.cdetype = content.decode('ascii')
            except UnicodeDecodeError:
                self.cdetype = None
            # cdetype
            if content == b'EBSP':
                if not self.mi.tags:
                    self.mi.tags = []
                self.mi.tags.append(_('Sample Book'))
        elif idx == 502:
            # last update time
            pass
        elif idx == 503:  # Long title
            # Amazon seems to regard this as the definitive book title
            # rather than the title from the PDB header. In fact, when
            # sending MOBI files through Amazon's email service, if the
            # title contains non-ASCII or non-filename-safe characters,
            # they are mangled in the PDB header.
            try:
                title = content.decode(codec)
            except Exception:
                pass
        # else:
        #     print('unknown record', idx, repr(content))
    if title:
        self.mi.title = replace_entities(title)
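# A sketch of one EXTH record as the loop above consumes it (layout
# inferred from the struct.unpack calls, so treat it as an assumption):
# 4-byte big-endian record type, 4-byte big-endian total record size
# (header included), then size - 8 bytes of payload.
import struct

payload = b'EBSP'  # record 501 (cdetype); EBSP marks a sample book
record = struct.pack('>LL', 501, 8 + len(payload)) + payload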
def parse_comment_tags(src):
    all_names = '|'.join(itervalues(COMMENT_NAMES))
    rmap = {v: k for k, v in iteritems(COMMENT_NAMES)}
    ans = {}
    for match in re.finditer(r'''<!--\s*(?P<name>%s)\s*=\s*%s''' % (all_names, attr_pat), src):
        field = rmap[match.group('name')]
        if field not in ans:
            ans[field] = replace_entities(match.group('content'))
        if len(ans) == len(COMMENT_NAMES):
            break
    return ans
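# Usage sketch for parse_comment_tags. COMMENT_NAMES is assumed here
# (for illustration only) to map field -> comment tag name:
#   COMMENT_NAMES = {'title': 'TITLE', 'authors': 'AUTHOR'}
#   parse_comment_tags('<!-- TITLE="Emma" -->') -> {'title': 'Emma'}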
def add_words_from_escaped_html(text, words, file_name, node, attr, locale):
    text = replace_entities(text)
    root = parse('<html><body><div>%s</div></body></html>' % text,
                 decoder=lambda x: x.decode('utf-8'))
    ewords = defaultdict(list)
    ewords[None] = 0
    read_words_from_html(root, ewords, file_name, locale)
    words[None] += ewords.pop(None)
    for k, locs in iteritems(ewords):
        for loc in locs:
            loc.location_node, loc.node_item = node, (False, attr)
        words[k].extend(locs)
def handle_comment(data, comment_tags):
    if not hasattr(handle_comment, 'pat'):
        handle_comment.pat = re.compile(r'''(?P<name>\S+)\s*=\s*%s''' % attr_pat)
    for match in handle_comment.pat.finditer(data):
        x = match.group('name')
        field = None
        try:
            field = rmap_comment[x]
        except KeyError:
            pass
        if field:
            comment_tags[field].append(replace_entities(match.group('content')))
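# handle_comment caches its compiled pattern as a function attribute, so
# the regex is built only once per process. rmap_comment is assumed to
# map a raw comment name to a metadata field, e.g. (illustrative):
#   rmap_comment = {'TITLE': 'title'}
#   handle_comment('TITLE="Emma"', comment_tags)
# appends 'Emma' to comment_tags['title'] (comment_tags being a
# defaultdict(list)).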
def find_links(self, src):
    for match in self.LINK_PAT.finditer(src):
        url = None
        for i in ('url1', 'url2', 'url3'):
            url = match.group(i)
            if url:
                break
        url = replace_entities(url)
        try:
            link = self.resolve(url)
        except ValueError:
            # Unparseable URL, ignore
            continue
        if link not in self.links:
            self.links.append(link)
def process_node(node, html_parent):
    ntype = node.get('type')
    if ntype == 'tag':
        c = html_parent.makeelement(node['name'])
        c.attrib.update(node.get('attribs', {}))
        html_parent.append(c)
        for nc in node.get('children', ()):
            process_node(nc, c)
    elif ntype == 'text':
        text = node.get('data')
        if text:
            text = replace_entities(text)
            if len(html_parent):
                t = html_parent[-1]
                t.tail = (t.tail or '') + text
            else:
                html_parent.text = (html_parent.text or '') + text
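# The dict shape process_node consumes, reconstructed from the lookups
# above (values are illustrative): a recursive tree of 'tag' and 'text'
# nodes. Text arriving after an existing child becomes that child's
# tail, which is how lxml represents inter-element text.
example_node = {
    'type': 'tag', 'name': 'p', 'attribs': {'class': 'body'},
    'children': [{'type': 'text', 'data': 'Hello, world'}],
}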
def build_toc(index_entries):
    ans = TOC(base_path=os.getcwdu())
    levels = {x['hlvl'] for x in index_entries}
    num_map = {-1: ans}
    level_map = {l: [x for x in index_entries if x['hlvl'] == l] for l in levels}
    for lvl in sorted(levels):
        for item in level_map[lvl]:
            parent = num_map[item['parent']]
            child = parent.add_item(item['href'], item['idtag'],
                                    replace_entities(item['text'], encoding=None))
            num_map[item['num']] = child

    # Set play orders in depth first order
    for i, item in enumerate(ans.flat()):
        item.play_order = i
    return ans
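# Shape of one index entry as implied by the key lookups in build_toc
# (values are illustrative): 'parent' and 'num' link entries into a
# tree, with -1 denoting the TOC root.
entry = {'hlvl': 0, 'parent': -1, 'num': 0,
         'href': 'index.html', 'idtag': 'toc_1', 'text': 'Chapter 1'}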
def handle_entities(text, func):
    return func(replace_entities(text))
def count_chars_in_escaped_html(text, counter, file_name, node, attr, locale):
    text = replace_entities(text)
    root = parse('<html><body><div>%s</div></body></html>' % text,
                 decoder=lambda x: x.decode('utf-8'))
    count_chars_in_html(root, counter, file_name, locale)
def handle_entities(text, func):
    return prepare_string_for_xml(func(replace_entities(text)))
def read_simple_property(elem):
    # A simple property
    if elem.text:
        return replace_entities(elem.text)
    return replace_entities(elem.get(expand('rdf:resource'), ''))
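# The two shapes read_simple_property handles, per the branches above: a
# literal value in the element text, else an rdf:resource attribute.
# expand() is assumed to turn 'rdf:resource' into its Clark-notation
# name; the sample element below is illustrative.
from lxml import etree

RDF_NS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
elem = etree.fromstring('<v xmlns:rdf="%s" rdf:resource="urn:x"/>' % RDF_NS)
# read_simple_property(elem) -> 'urn:x' (no text, so the attribute wins)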
def get_metadata_(src, encoding=None):
    # Metadata definitions as in
    # https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9
    if isbytestring(src):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long
    comment_tags = parse_comment_tags(src)
    meta_tags = parse_meta_tags(src)

    def get(field):
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = ans.strip()
        if not ans:
            ans = None
        return ans

    # Title
    title = get('title')
    if not title:
        pat = re.compile(r'<title>([^<>]+?)</title>', re.IGNORECASE)
        match = pat.search(src)
        if match:
            title = replace_entities(match.group(1))

    # Author
    authors = get('authors') or _('Unknown')

    # Create MetaInformation with Title and Author
    mi = Metadata(title or _('Unknown'), string_to_authors(authors))
    for field in ('publisher', 'isbn', 'language', 'comments'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except Exception:
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)

    # SERIES
    series = get('series')
    if series:
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except Exception:
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            series_index = get('series_index')
            try:
                series_index = float(series_index)
            except Exception:
                pass
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 5:
                mi.rating /= 2.
            if mi.rating > 5:
                mi.rating = 0
        except Exception:
            pass

    # TAGS
    tags = get('tags')
    if tags:
        tags = [x.strip() for x in tags.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    return mi
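# Minimal round trip (hedged: assumes the helpers above are importable
# and the input follows the mobileread meta conventions linked above):
#   mi = get_metadata_('<title>Emma</title>'
#                      '<meta name="author" content="Jane Austen">')
#   mi.title -> 'Emma'; mi.authors -> ['Jane Austen'], provided 'author'
#   is among the configured META_NAMES for the authors field.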
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    '''
    Convert the pdf into html using the pdftohtml app.
    This will write the html as index.html into output_dir.
    It will also write all extracted images to the output_dir.
    '''
    pdfsrc = os.path.join(output_dir, 'src.pdf')
    index = os.path.join(output_dir, 'index.' + ('xml' if as_xml else 'html'))

    with lopen(pdf_path, 'rb') as src, lopen(pdfsrc, 'wb') as dest:
        shutil.copyfileobj(src, dest)

    with CurrentDir(output_dir):

        def a(x):
            return os.path.basename(x)

        exe = PDFTOHTML
        cmd = [exe, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
               '-nodrm', a(pdfsrc), a(index)]

        if isbsd:
            cmd.remove('-nodrm')
        if no_images:
            cmd.append('-i')
        if as_xml:
            cmd.append('-xml')

        logf = PersistentTemporaryFile('pdftohtml_log')
        try:
            p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
                      stdin=subprocess.PIPE)
        except OSError as err:
            if err.errno == errno.ENOENT:
                raise ConversionError(
                    _('Could not find pdftohtml, check it is in your PATH'))
            else:
                raise
        ret = eintr_retry_call(p.wait)
        logf.flush()
        logf.close()
        out = lopen(logf.name, 'rb').read().decode('utf-8', 'replace').strip()
        if ret != 0:
            raise ConversionError('pdftohtml failed with return code: %d\n%s' % (ret, out))
        if out:
            prints("pdftohtml log:")
            prints(out)
        if not os.path.exists(index) or os.stat(index).st_size < 100:
            raise DRMError()

        if not as_xml:
            with lopen(index, 'r+b') as i:
                raw = i.read().decode('utf-8')
                raw = flip_images(raw)
                raw = raw.replace('<head', '<!-- created by calibre\'s pdftohtml -->\n <head', 1)
                i.seek(0)
                i.truncate()
                # versions of pdftohtml >= 0.20 output self closing <br> tags, this
                # breaks the pdf heuristics regexps, so replace them
                raw = raw.replace('<br/>', '<br>')
                raw = re.sub(r'<a\s+name=(\d+)', r'<a id="\1"', raw, flags=re.I)
                raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw, flags=re.I)
                raw = re.sub(r'<a href="index.html#(\d+)"', r'<a href="#p\1"', raw, flags=re.I)
                raw = replace_entities(raw)
                raw = raw.replace('\u00a0', ' ')
                i.write(raw.encode('utf-8'))

            # Run pdftohtml again over just the first page to extract the
            # document outline from its XML output
            cmd = [exe, '-f', '1', '-l', '1', '-xml', '-i', '-enc', 'UTF-8',
                   '-noframes', '-p', '-nomerge', '-nodrm', '-q', '-stdout',
                   a(pdfsrc)]
            if isbsd:
                cmd.remove('-nodrm')
            p = popen(cmd, stdout=subprocess.PIPE)
            raw = p.stdout.read().strip()
            if p.wait() == 0 and raw:
                parse_outline(raw, output_dir)

    try:
        os.remove(pdfsrc)
    except Exception:
        pass
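# Usage sketch (assumes the module-level names used above, e.g. lopen,
# PDFTOHTML, popen and CurrentDir, are importable from their calibre
# homes):
#   pdftohtml('/tmp/out', '/tmp/book.pdf', no_images=False)
# writes /tmp/out/index.html plus extracted images, raising
# ConversionError or DRMError on failure.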
def get_book_details(self, log, metadata, timeout, cachedPage):  # {{{
    from lxml import etree, html

    from calibre.ebooks.chardet import xml_to_unicode

    if not cachedPage:
        url = self.get_book_url(metadata.get_identifiers())[2]
        # log.debug(u'book_details_url', url)
        raw = self.browser.open_novisit(url, timeout=timeout).read()
        fulldoc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
    else:
        fulldoc = cachedPage
        log.debug(u'book_details -> using cached page')

    fullString = etree.tostring(fulldoc)
    doc = fulldoc.xpath(u'//div[@class="bDetailPage"][1]')[0]

    # series ("Серия"/"Серии" means "Series")
    series_elem = doc.xpath(u'//div[contains(text(), "Сери")]')
    if series_elem:
        series_text_elem = series_elem[0].getnext()
        metadata.series = series_text_elem.xpath(u'.//a/text()')[0]
        log.debug(u'**Seria: ', metadata.series)

    isbn = None
    isbn_elem = doc.xpath(u'//div[contains(text(), "ISBN")]')
    if isbn_elem:
        isbn = isbn_elem[0].getnext().xpath(u'normalize-space(./text())')
        metadata.identifiers['isbn'] = isbn

    # get authors/editors if no authors are available
    authors_joined = ','.join(metadata.authors)
    if authors_joined == '' or authors_joined == 'Unknown':
        authors_from_detail = []
        # "Редактор" means "Editor"
        editor_elem = doc.xpath(u'//div[contains(text(), "Редактор")]')
        if editor_elem:
            editor = editor_elem[0].getnext().xpath(u'.//a/text()')[0]
            authors_from_detail.append(editor + u' (ред.)')  # "(ed.)"
        # "Автор" means "Author"
        authors_elem = doc.xpath(u'//div[contains(text(), "Автор")]')
        if authors_elem:
            authors = authors_elem[0].getnext().xpath(u'.//a/text()')  # list
            authors_from_detail.extend(authors)
        if len(authors_from_detail) > 0:
            metadata.authors = authors_from_detail

    cover = doc.xpath('.//img[contains(@class, "fullImage")]/@src')[0]
    metadata.ozon_cover_url = _translateToBigCoverUrl(cover)

    # "Издатель" means "Publisher"
    publishers = None
    publishers_elem = doc.xpath(u'//div[contains(text(), "Издатель")]')
    if publishers_elem:
        publishers_elem = publishers_elem[0].getnext()
        publishers = publishers_elem.xpath(u'.//a/text()')[0]
    if publishers:
        metadata.publisher = publishers

    # "зык" is the tail of "Язык"/"язык", meaning "Language"
    displ_lang = None
    langs = None
    langs_elem = doc.xpath(u'//div[contains(text(), "зык")]')
    if langs_elem:
        langs_elem = langs_elem[0].getnext()
        # getnext() may return None, so guard before querying it
        langs = langs_elem.xpath(u'text()')[0].strip() if langs_elem is not None else None
    if langs:
        lng_splt = langs.split(u',')
        if lng_splt:
            displ_lang = lng_splt[0].strip()
            # log.debug(u'displ_lang1: ', displ_lang)
    metadata.language = _translageLanguageToCode(displ_lang)
    # log.debug(u'Language: ', metadata.language)

    # can be set before from xml search response
    if not metadata.pubdate:
        # "Год выпуска" means "Year of publication"
        pubdate_elem = doc.xpath(u'//div[contains(text(), "Год выпуска")]')
        if pubdate_elem:
            pubYear = pubdate_elem[0].getnext().xpath(u'text()')[0].strip()
            if pubYear:
                matcher = re.search(r'\d{4}', pubYear)
                if matcher:
                    metadata.pubdate = toPubdate(log, matcher.group(0))
    # log.debug(u'Pubdate: ', metadata.pubdate)

    # comments, scraped from the page's JavaScript data
    beginning = fullString.find(u'FirstBlock')
    end = fullString.find(u'}', beginning)
    comments = unicode(fullString[beginning + 75:end - 1]).decode('unicode-escape')
    metadata.comments = replace_entities(comments, 'utf-8')