class KindleHighResCovers(Source):
    name = 'Kindle hi-res covers'
    description = 'Downloads high resolution covers for Kindle editions from Amazon'
    capabilities = frozenset(['cover'])
    author = 'Leonardo Brondani Schenkel <*****@*****.**>'
    version = (0, 4, 0)
    can_get_multiple_covers = True

    KEY_MAX_COVERS = 'max_covers'
    options = (
        Option(KEY_MAX_COVERS, 'number', 2,
               _('Maximum number of covers to get'),
               _('The maximum number of covers to get from amazon.com (since we '
                 'try to get the covers from 2 sources, you might end up with '
                 'two versions of each retrieved cover).')),
    )

    def download_cover(self, log, result_queue, abort, title=None, authors=None,
                       identifiers={}, timeout=60, get_best_cover=False):
        urls = get_cover_urls(log, title, authors, identifiers, timeout)
        log.info('Cover URLs: ' + repr(urls))
        if urls:
            log.info('Create link to download cover')
            self.download_multiple_covers(title, authors, urls, False, timeout,
                                          result_queue, abort, log, None)
class BigBookSearch(Source):
    name = 'Big Book Search'
    description = _('Downloads multiple book covers from Amazon. Useful to find alternate covers.')
    capabilities = frozenset(['cover'])
    config_help_message = _('Configure the Big Book Search plugin')
    can_get_multiple_covers = True
    options = (
        Option('max_covers', 'number', 5,
               _('Maximum number of covers to get'),
               _('The maximum number of covers to process from the search result')),
    )
    supports_gzip_transfer_encoding = True

    def download_cover(self, log, result_queue, abort, title=None, authors=None,
                       identifiers={}, timeout=30, get_best_cover=False):
        if not title:
            return
        br = self.browser
        tokens = tuple(self.get_title_tokens(title)) + tuple(self.get_author_tokens(authors))
        urls = get_urls(br, tokens)
        self.download_multiple_covers(title, authors, urls, get_best_cover,
                                      timeout, result_queue, abort, log)
class GoogleImages(Source): name = 'Google Images' description = _('Downloads covers from a Google Image search. Useful to find larger/alternate covers.') capabilities = frozenset(['cover']) config_help_message = _('Configure the Google Image Search plugin') can_get_multiple_covers = True options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'), _('The maximum number of covers to process from the google search result')), Option('size', 'choices', 'svga', _('Cover size'), _('Search for covers larger than the specified size'), choices=OrderedDict(( ('any', _('Any size'),), ('l', _('Large'),), ('qsvga', _('Larger than %s')%'400x300',), ('vga', _('Larger than %s')%'640x480',), ('svga', _('Larger than %s')%'600x800',), ('xga', _('Larger than %s')%'1024x768',), ('2mp', _('Larger than %s')%'2 MP',), ('4mp', _('Larger than %s')%'4 MP',), ))), ) def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): if not title: return timeout = max(60, timeout) # Needs at least a minute title = ' '.join(self.get_title_tokens(title)) author = ' '.join(self.get_author_tokens(authors)) urls = self.get_image_urls(title, author, log, abort, timeout) self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log) def get_image_urls(self, title, author, log, abort, timeout): from calibre.utils.ipc.simple_worker import fork_job, WorkerError try: return fork_job('calibre.ebooks.metadata.sources.google_images', 'search', args=(title, author, self.prefs['size'], timeout), no_output=True, abort=abort, timeout=timeout)['result'] except WorkerError as e: if e.orig_tb: log.error(e.orig_tb) log.exception('Searching google failed:' + as_unicode(e)) except Exception as e: log.exception('Searching google failed:' + as_unicode(e)) return []
class Douban(Source): name = "Douban Book" author = "Li Fanxi, xcffl, jnozsc, else" version = (4, 0, 1) minimum_calibre_version = (5, 0, 0) description = ("Downloads metadata and covers from Douban.com. " "Useful only for Chinese language books.") capabilities = frozenset(["identify", "cover"]) touched_fields = frozenset([ "title", "authors", "tags", "pubdate", "comments", "publisher", "identifier:isbn", "rating", "identifier:douban", ]) # language currently disabled supports_gzip_transfer_encoding = True cached_cover_url_is_reliable = True ISBN_URL = "http://douban.com/isbn/" SUBJECT_URL = "http://book.douban.com/subject/" DOUBAN_BOOK_URL = 'https://book.douban.com/subject/%s/' options = (Option( "include_subtitle_in_title", "bool", True, ("Include subtitle in book title:"), ("Whether to append subtitle in the book title."), ), ) def identify( self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30, ): import json time.sleep(random.randint(1, 3)) log.info("start get metadata from douban...") log.info(str(identifiers)) # query = self.create_query(log, title=title, authors=authors, identifiers=identifiers) book = self.get_book(log, identifiers) # There is no point running these queries in threads as douban # throttles requests returning 403 Forbidden errors self.get_all_details(log, book, abort, result_queue, timeout) return None def to_metadata(self, log, entry_, timeout): # {{{ from calibre.utils.date import parse_date, utcnow log.info("to_metadata") douban_id = entry_.get("id") title = entry_.get("title") description = entry_.get("summary") # subtitle = entry_.get('subtitle') # TODO: std metada doesn't have this field publisher = entry_.get("publisher") isbn = entry_.get("isbn13") # ISBN11 is obsolute, use ISBN13 pubdate = entry_.get("pubdate") authors = entry_.get("author") # authors = "author" book_tags = entry_.get("tags") rating = entry_.get("rating") cover_url = entry_.get("cover") series = entry_.get("series") if not authors: authors = [("Unknown")] if not douban_id or not title: # Silently discard this entry return None mi = Metadata(title, authors) mi.identifiers = {"douban": douban_id} mi.publisher = publisher mi.comments = description # mi.subtitle = subtitle # ISBN isbns = [] if isinstance(isbn, (type(""), bytes)): if check_isbn(isbn): isbns.append(isbn) else: for x in isbn: if check_isbn(x): isbns.append(x) if isbns: mi.isbn = sorted(isbns, key=len)[-1] mi.all_isbns = isbns # Tags mi.tags = book_tags # pubdate if pubdate: try: default = utcnow().replace(day=15) mi.pubdate = parse_date(pubdate, assume_utc=True, default=default) except BaseException: log.error("Failed to parse pubdate %r" % pubdate) if rating: try: # mi.publisher += "#PrB.rating#" + str(rating) mi.rating = rating / 2.0 except BaseException: log.exception("Failed to parse rating") mi.rating = 0 # Cover mi.has_douban_cover = None u = cover_url if u: # If URL contains "book-default", the book doesn't have a cover if u.find("book-default") == -1: mi.has_douban_cover = u # Series if series: mi.series = series return mi # }}} def get_isbn_url(self, isbn): # {{{ if isbn is not None: return self.ISBN_URL + isbn else: return "" # }}} def get_douban_url(self, identifiers): isbn = self.get_book_isbn(identifiers) url = self.get_isbn_url(isbn) if url: return url tup = self.get_book_url(identifiers) if tup: return tup[2] def get_book_url(self, identifiers): # {{{ db = identifiers.get('douban', None) if db is not None: return ('douban', db, self.DOUBAN_BOOK_URL % db) # }}} def get_book_isbn(self, 
identifiers): isbn = check_isbn(identifiers.get("isbn", None)) return isbn def download_cover( self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False, ): cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: log.info("No cached cover found, running identify") rq = Queue() self.identify(log, rq, abort, title=title, authors=authors, identifiers=identifiers) if abort.is_set(): return results = [] while True: try: results.append(rq.get_nowait()) except Empty: break results.sort(key=self.identify_results_keygen( title=title, authors=authors, identifiers=identifiers)) for mi in results: cached_url = self.get_cached_cover_url(mi.identifiers) if cached_url is not None: break if cached_url is None: log.info("No cover found") return if abort.is_set(): return br = self.browser log("Downloading cover from:", cached_url) try: cdata = br.open_novisit(cached_url, timeout=timeout).read() if cdata: result_queue.put((self, cdata)) except BaseException: log.exception("Failed to download cover from:", cached_url) # }}} def get_cached_cover_url(self, identifiers): # {{{ url = None db = identifiers.get("douban", None) if db is None: isbn = identifiers.get("isbn", None) if isbn is not None: db = self.cached_isbn_to_identifier(isbn) if db is not None: url = self.cached_identifier_to_cover_url(db) return url # }}} def get_all_details(self, log, book, abort, result_queue, timeout): # {{{ try: log.info("get_all_details") ans = self.to_metadata(log, book, timeout) if isinstance(ans, Metadata): ans.source_relevance = 0 douban_id = ans.identifiers["douban"] isbn = book.get("isbn13") self.cache_isbn_to_identifier(isbn, douban_id) if ans.has_douban_cover: self.cache_identifier_to_cover_url(douban_id, ans.has_douban_cover) self.clean_downloaded_metadata(ans) result_queue.put(ans) except BaseException: log.exception("Failed to get metadata for identify entry:", book["id"]) if abort.is_set(): return # }}} def get_book(self, log, identifiers={}): log.info("start get book......") url = self.get_douban_url(identifiers) html = self.__get_html(url) if html == -1: # log.info("book not found: " + isbn) return -1 soup = self.__get_soup(html=html) infos = self.__get_infos(soup=soup) isbn = self.__get_isbn(log, identifiers, soup, infos) book = {"isbn13": isbn} book["author"] = self.__get_authors(infos) book["publisher"] = self.__get_info(infos, "出版社:") book["pubdate"] = self.__get_info(infos, "出版年:") book["series"] = self.__get_info(infos, "丛书:") book["id"] = self.__get_id(soup=soup) book["tags"] = self.__get_tags(soup=soup) book["rating"] = self.__get_score(soup=soup) book["title"] = self.__get_title(soup=soup) book["summary"] = self.__get_intro(soup=soup) book["cover"] = self.__get_cover(soup=soup) return book def __get_html(self, url): headers_ = { "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36" } request = urllib.request.Request(url, headers=headers_) try: response = urllib.request.urlopen(request) except BaseException: return -1 html = response.read().decode("utf-8") return html def __get_soup(self, html=""): soup = BeautifulSoup(html, "lxml", exclude_encodings="utf-8") return soup def __get_infos(self, soup): soupSelect = str(soup.select("#info")) soupTemp = BeautifulSoup(str(soupSelect), "lxml", exclude_encodings="utf-8") infosTemp = soupTemp.text.splitlines() infos = [] for info in infosTemp: tmp = info.strip() if tmp and tmp != "/": infos.append(tmp) infos.remove("[") 
infos.remove("]") return infos def __get_info(self, infos, name): for token in infos: if token.find(name) != -1: return token[len(name) + 1:] return "" def __get_authors(self, infos): begin = -1 end = -1 i = 0 for token in infos: if token == "作者:": begin = i elif token.find("出版社:") != -1: end = i + 1 break else: i = i + 1 authors = [] if begin == -1: return authors if end == -1: authors.append(infos[begin + 1]) return authors else: for i in range(begin + 1, end): author = infos[i].strip() author = author.replace("【", "[") author = author.replace("】", "]") author = author.replace("(", "[") author = author.replace(")", "]") author = author.replace("〔", "[") author = author.replace("〕", "]") author = author.replace("(", "[") author = author.replace(")", "]") author = author.replace("]", "] ") author = author.replace("•", "·") author = author.replace("・", "·") authors.append(author) return authors def __get_id(self, soup): idSelects = str(soup.select("meta")).split() for item in idSelects: idIndex = item.find("douban.com/book/subject/") if idIndex != -1: id = item[idIndex + 24:-2] return id return 0 def __get_tags(self, soup): tagSelect = str(soup.select("#db-tags-section > div")) tagTemp = BeautifulSoup(str(tagSelect), "lxml", exclude_encodings="utf-8") tagText = tagTemp.text tags = tagText.split() tags.remove("[") tags.remove("]") return tags def __get_cover(self, soup): coverSelect = str(soup.select("#mainpic > a > img")) tempCover = str( BeautifulSoup(str(coverSelect), "lxml", exclude_encodings="utf-8")) index1 = tempCover.find("src=") tempCover = tempCover[index1 + 5:] index2 = tempCover.find('"') tempCover = tempCover[:index2] return tempCover def __get_score(self, soup): soupSelect = str( soup.select( "#interest_sectl > div > div.rating_self.clearfix > strong")) soupTemp = BeautifulSoup(str(soupSelect), "lxml", exclude_encodings="utf-8") score = soupTemp.text.strip("[] \n\t") if score: s = float(score) return s else: return 0.0 def __get_title(self, soup): soupSelect = str(soup.select("body>div>h1>span")) soupTemp = BeautifulSoup(str(soupSelect), "lxml", exclude_encodings="utf-8") return str(soupTemp.text).strip("[] \n\t") def __get_intro(self, soup): soupSelect = soup.select("#link-report") soupTemp = BeautifulSoup(str(soupSelect), "lxml", exclude_encodings="utf-8") intro = str(soupTemp.text).strip("[] \n\t") find = intro.find("(展开全部)") if find != -1: intro = intro[find + 6:] return intro.strip("[] \n\t") def __get_isbn(self, log, identifiers, soup, infos): isbn = identifiers.get("isbn", None) if isbn: return isbn pattern = re.compile(r"ISBN: (\d+)", re.IGNORECASE) isbn = '' for info in infos: match = pattern.match(info) if match: isbn = match.group(1) break return isbn
class Douban(Source): name = 'Douban Books' author = 'Li Fanxi' version = (2, 1, 2) minimum_calibre_version = (2, 80, 0) description = _('Downloads metadata and covers from Douban.com. ' 'Useful only for Chinese language books.') capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset([ 'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher', 'identifier:isbn', 'rating', 'identifier:douban' ]) # language currently disabled supports_gzip_transfer_encoding = True cached_cover_url_is_reliable = True DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d' DOUBAN_BOOK_URL = 'https://book.douban.com/subject/%s/' options = (Option('include_subtitle_in_title', 'bool', True, _('Include subtitle in book title:'), _('Whether to append subtitle in the book title.')), ) def to_metadata(self, browser, log, entry_, timeout): # {{{ from lxml import etree from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.date import parse_date, utcnow from calibre.utils.cleantext import clean_ascii_chars XPath = partial(etree.XPath, namespaces=NAMESPACES) entry = XPath('//atom:entry') entry_id = XPath('descendant::atom:id') title = XPath('descendant::atom:title') description = XPath('descendant::atom:summary') subtitle = XPath("descendant::db:attribute[@name='subtitle']") publisher = XPath("descendant::db:attribute[@name='publisher']") isbn = XPath("descendant::db:attribute[@name='isbn13']") date = XPath("descendant::db:attribute[@name='pubdate']") creator = XPath("descendant::db:attribute[@name='author']") booktag = XPath("descendant::db:tag/attribute::name") rating = XPath("descendant::gd:rating/attribute::average") cover_url = XPath( "descendant::atom:link[@rel='image']/attribute::href") def get_text(extra, x): try: ans = x(extra) if ans: ans = ans[0].text if ans and ans.strip(): return ans.strip() except: log.exception('Programming error:') return None id_url = entry_id(entry_)[0].text.replace('http://', 'https://') douban_id = id_url.split('/')[-1] title_ = ': '.join([x.text for x in title(entry_)]).strip() subtitle = ': '.join([x.text for x in subtitle(entry_)]).strip() if self.prefs['include_subtitle_in_title'] and len(subtitle) > 0: title_ = title_ + ' - ' + subtitle authors = [x.text.strip() for x in creator(entry_) if x.text] if not authors: authors = [_('Unknown')] if not id_url or not title: # Silently discard this entry return None mi = Metadata(title_, authors) mi.identifiers = {'douban': douban_id} try: log.info(id_url) raw = get_details(browser, id_url, timeout) feed = etree.fromstring( xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0], parser=etree.XMLParser(recover=True, no_network=True, resolve_entities=False)) extra = entry(feed)[0] except: log.exception('Failed to get additional details for', mi.title) return mi mi.comments = get_text(extra, description) mi.publisher = get_text(extra, publisher) # ISBN isbns = [] for x in [t.text for t in isbn(extra)]: if check_isbn(x): isbns.append(x) if isbns: mi.isbn = sorted(isbns, key=len)[-1] mi.all_isbns = isbns # Tags try: btags = [x for x in booktag(extra) if x] tags = [] for t in btags: atags = [y.strip() for y in t.split('/')] for tag in atags: if tag not in tags: tags.append(tag) except: log.exception('Failed to parse tags:') tags = [] if tags: mi.tags = [x.replace(',', ';') for x in tags] # pubdate pubdate = get_text(extra, date) if pubdate: try: default = utcnow().replace(day=15) mi.pubdate = parse_date(pubdate, assume_utc=True, default=default) except: log.error('Failed to parse pubdate %r' % 
pubdate) # Ratings if rating(extra): try: mi.rating = float(rating(extra)[0]) / 2.0 except: log.exception('Failed to parse rating') mi.rating = 0 # Cover mi.has_douban_cover = None u = cover_url(extra) if u: u = u[0].replace('/spic/', '/lpic/') # If URL contains "book-default", the book doesn't have a cover if u.find('book-default') == -1: mi.has_douban_cover = u return mi # }}} def get_book_url(self, identifiers): # {{{ db = identifiers.get('douban', None) if db is not None: return ('douban', db, self.DOUBAN_BOOK_URL % db) # }}} def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ try: from urllib.parse import urlencode except ImportError: from urllib import urlencode SEARCH_URL = 'https://api.douban.com/book/subjects?' ISBN_URL = 'https://api.douban.com/book/subject/isbn/' SUBJECT_URL = 'https://api.douban.com/book/subject/' q = '' t = None isbn = check_isbn(identifiers.get('isbn', None)) subject = identifiers.get('douban', None) if isbn is not None: q = isbn t = 'isbn' elif subject is not None: q = subject t = 'subject' elif title or authors: def build_term(prefix, parts): return ' '.join(x for x in parts) title_tokens = list(self.get_title_tokens(title)) if title_tokens: q += build_term('title', title_tokens) author_tokens = list( self.get_author_tokens(authors, only_first_author=True)) if author_tokens: q += ((' ' if q != '' else '') + build_term('author', author_tokens)) t = 'search' q = q.strip() if isinstance(q, type(u'')): q = q.encode('utf-8') if not q: return None url = None if t == "isbn": url = ISBN_URL + q elif t == 'subject': url = SUBJECT_URL + q else: url = SEARCH_URL + urlencode({ 'q': q, }) if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '': if t == "isbn" or t == "subject": url = url + "?apikey=" + self.DOUBAN_API_KEY else: url = url + "&apikey=" + self.DOUBAN_API_KEY return url # }}} def download_cover( self, log, result_queue, abort, # {{{ title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: log.info('No cached cover found, running identify') rq = Queue() self.identify(log, rq, abort, title=title, authors=authors, identifiers=identifiers) if abort.is_set(): return results = [] while True: try: results.append(rq.get_nowait()) except Empty: break results.sort(key=self.identify_results_keygen( title=title, authors=authors, identifiers=identifiers)) for mi in results: cached_url = self.get_cached_cover_url(mi.identifiers) if cached_url is not None: break if cached_url is None: log.info('No cover found') return if abort.is_set(): return br = self.browser log('Downloading cover from:', cached_url) try: cdata = br.open_novisit(cached_url, timeout=timeout).read() if cdata: result_queue.put((self, cdata)) except: log.exception('Failed to download cover from:', cached_url) # }}} def get_cached_cover_url(self, identifiers): # {{{ url = None db = identifiers.get('douban', None) if db is None: isbn = identifiers.get('isbn', None) if isbn is not None: db = self.cached_isbn_to_identifier(isbn) if db is not None: url = self.cached_identifier_to_cover_url(db) return url # }}} def get_all_details( self, br, log, entries, abort, # {{{ result_queue, timeout): from lxml import etree for relevance, i in enumerate(entries): try: ans = self.to_metadata(br, log, i, timeout) if isinstance(ans, Metadata): ans.source_relevance = relevance db = ans.identifiers['douban'] for isbn in getattr(ans, 'all_isbns', []): self.cache_isbn_to_identifier(isbn, db) if 
ans.has_douban_cover: self.cache_identifier_to_cover_url( db, ans.has_douban_cover) self.clean_downloaded_metadata(ans) result_queue.put(ans) except: log.exception('Failed to get metadata for identify entry:', etree.tostring(i)) if abort.is_set(): break # }}} def identify( self, log, result_queue, abort, title=None, authors=None, # {{{ identifiers={}, timeout=30): from lxml import etree from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.cleantext import clean_ascii_chars XPath = partial(etree.XPath, namespaces=NAMESPACES) entry = XPath('//atom:entry') query = self.create_query(log, title=title, authors=authors, identifiers=identifiers) if not query: log.error('Insufficient metadata to construct query') return br = self.browser try: raw = br.open_novisit(query, timeout=timeout).read() except Exception as e: log.exception('Failed to make identify query: %r' % query) return as_unicode(e) try: parser = etree.XMLParser(recover=True, no_network=True) feed = etree.fromstring(xml_to_unicode( clean_ascii_chars(raw), strip_encoding_pats=True)[0], parser=parser) entries = entry(feed) except Exception as e: log.exception('Failed to parse identify results') return as_unicode(e) if not entries and identifiers and title and authors and \ not abort.is_set(): return self.identify(log, result_queue, abort, title=title, authors=authors, timeout=timeout) # There is no point running these queries in threads as douban # throttles requests returning 403 Forbidden errors self.get_all_details(br, log, entries, abort, result_queue, timeout) return None
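create_query above produces one of three URL shapes (ISBN lookup, subject lookup, or free-text search) and appends the API key with '?' or '&' depending on whether the URL already carries a query string. The condensed standalone sketch below reproduces that branching; the key and identifier values are placeholders.

from urllib.parse import urlencode

SEARCH_URL = 'https://api.douban.com/book/subjects?'
ISBN_URL = 'https://api.douban.com/book/subject/isbn/'
SUBJECT_URL = 'https://api.douban.com/book/subject/'

def build_query_url(isbn=None, douban_id=None, text=None, apikey=''):
    # One of three URL shapes, as in Douban.create_query above.
    if isbn:
        url = ISBN_URL + isbn
    elif douban_id:
        url = SUBJECT_URL + douban_id
    elif text:
        url = SEARCH_URL + urlencode({'q': text})
    else:
        return None
    if apikey:
        # ISBN/subject URLs have no query string yet, the search URL does.
        url += ('&' if '?' in url else '?') + 'apikey=' + apikey
    return url

# e.g. build_query_url(isbn='9787020002207', apikey='placeholder-key')
# -> 'https://api.douban.com/book/subject/isbn/9787020002207?apikey=placeholder-key'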
class Douban(Source): name = 'Douban Books Proxy' author = 'Li Fanxi & Driftcrow' version = (2, 1, 2) minimum_calibre_version = (2, 80, 0) description = _('Downloads metadata and covers from Douban.com. ' 'Useful only for Chinese language books.') capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset([ 'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher', 'identifier:isbn', 'rating', 'identifier:douban' ]) # language currently disabled supports_gzip_transfer_encoding = True cached_cover_url_is_reliable = True DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d' DOUBAN_BOOK_URL = 'https://book.douban.com/subject/%s/' DOUBAN_BOOK_PROXY = 'https://douban.uieee.com/v2/book/' # SEARCH_URL = 'https://douban.uieee.com/v2/book/search?' options = (Option('include_subtitle_in_title', 'bool', True, _('Include subtitle in book title:'), _('Whether to append subtitle in the book title.')), ) def to_metadata(self, browser, log, entry_, timeout): # {{{ from lxml import etree from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.date import parse_date, utcnow from calibre.utils.cleantext import clean_ascii_chars # log.info('entry_ is: ',entry_) id_url = entry_['url'] douban_id = entry_['id'] title_ = entry_['title'] subtitle = entry_['subtitle'] authors = [x.strip() for x in entry_['author'] if x] if not authors: authors = [_('Unknown')] mi = Metadata(title_, authors) mi.identifiers = {'douban': douban_id} mi.comments = entry_['summary'] mi.publisher = entry_['publisher'] # ISBN mi.isbn = entry_['isbn10'] mi.all_isbns = [entry_['isbn10'], entry_['isbn13']] # Tags mi.tags = [x['name'].strip() for x in entry_['tags']] # pubdate pubdate = entry_['pubdate'] if pubdate: try: default = utcnow().replace(day=15) mi.pubdate = parse_date(pubdate, assume_utc=True, default=default) except: log.error('Failed to parse pubdate %r' % pubdate) # Ratings mi.rating = float(entry_['rating']['average']) / 2.0 # Cover mi.has_douban_cover = entry_['image'] return mi # }}} def get_book_url(self, identifiers): # {{{ db = identifiers.get('douban', None) if db is not None: return ('douban', db, self.DOUBAN_BOOK_URL % db) # }}} def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ try: from urllib.parse import urlencode except ImportError: from urllib import urlencode SEARCH_URL = self.DOUBAN_BOOK_PROXY + 'search?' 
ISBN_URL = self.DOUBAN_BOOK_PROXY + 'isbn/' SUBJECT_URL = self.DOUBAN_BOOK_PROXY + 'subject/' q = '' t = None isbn = check_isbn(identifiers.get('isbn', None)) subject = identifiers.get('douban', None) if isbn is not None: q = isbn t = 'isbn' elif subject is not None: q = subject t = 'subject' elif title or authors: def build_term(prefix, parts): return ' '.join(x for x in parts) title_tokens = list(self.get_title_tokens(title)) if title_tokens: q += build_term('title', title_tokens) author_tokens = list( self.get_author_tokens(authors, only_first_author=True)) if author_tokens: q += ((' ' if q != '' else '') + build_term('author', author_tokens)) t = 'search' q = q.strip() if isinstance(q, type(u'')): q = q.encode('utf-8') if not q: return None url = None if t == "isbn": url = ISBN_URL + q elif t == 'subject': url = SUBJECT_URL + q else: url = SEARCH_URL + urlencode({ 'q': q, }) # if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '': # if t == "isbn" or t == "subject": # url = url + "?apikey=" + self.DOUBAN_API_KEY # else: # url = url + "&apikey=" + self.DOUBAN_API_KEY return url # }}} def download_cover( self, log, result_queue, abort, # {{{ title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: log.info('No cached cover found, running identify') rq = Queue() self.identify(log, rq, abort, title=title, authors=authors, identifiers=identifiers) if abort.is_set(): return results = [] while True: try: results.append(rq.get_nowait()) except Empty: break results.sort(key=self.identify_results_keygen( title=title, authors=authors, identifiers=identifiers)) for mi in results: cached_url = self.get_cached_cover_url(mi.identifiers) if cached_url is not None: break if cached_url is None: log.info('No cover found') return if abort.is_set(): return br = self.browser log('Downloading cover from:', cached_url) try: cdata = br.open_novisit(cached_url, timeout=timeout).read() if cdata: result_queue.put((self, cdata)) except: log.exception('Failed to download cover from:', cached_url) # }}} def get_cached_cover_url(self, identifiers): # {{{ url = None db = identifiers.get('douban', None) if db is None: isbn = identifiers.get('isbn', None) if isbn is not None: db = self.cached_isbn_to_identifier(isbn) if db is not None: url = self.cached_identifier_to_cover_url(db) return url # }}} def get_all_details( self, br, log, entries, abort, # {{{ result_queue, timeout): # for relevance, i in enumerate(entries): for i in entries: try: ans = self.to_metadata(br, log, i, timeout) if isinstance(ans, Metadata): # ans.source_relevance = relevance db = ans.identifiers['douban'] for isbn in getattr(ans, 'all_isbns', []): self.cache_isbn_to_identifier(isbn, db) if ans.has_douban_cover: self.cache_identifier_to_cover_url( db, ans.has_douban_cover) self.clean_downloaded_metadata(ans) result_queue.put(ans) except: log.exception('Failed to get metadata for identify entry:', i) if abort.is_set(): break # }}} def identify( self, log, result_queue, abort, title=None, authors=None, # {{{ identifiers={}, timeout=30): import json from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.cleantext import clean_ascii_chars # XPath = partial(etree.XPath, namespaces=NAMESPACES) # entry = XPath('//atom:entry') query = self.create_query(log, title=title, authors=authors, identifiers=identifiers) if not query: log.error('Insufficient metadata to construct query') return br = self.browser try: raw = br.open_novisit(query, 
timeout=timeout).read() except Exception as e: log.exception('Failed to make identify query: %r' % query) return as_unicode(e) try: # parser = etree.XMLParser(recover=True, no_network=True) # log.info('parser is ', parser) # feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw), # strip_encoding_pats=True)[0], parser=parser) # log.info('feed is ', feed) # entries = entry(feed) entries = [] data = json.loads(raw) if 'books' in data: entries = data['books'] else: entries.append(data) except Exception as e: log.exception('Failed to parse identify results') return as_unicode(e) if not entries and identifiers and title and authors and \ not abort.is_set(): return self.identify(log, result_queue, abort, title=title, authors=authors, timeout=timeout) # There is no point running these queries in threads as douban # throttles requests returning 403 Forbidden errors self.get_all_details(br, log, entries, abort, result_queue, timeout) return None
class JD(Source): name = 'JD' version = (0, 0, 1) author = 'Lewix Liu' minimum_calibre_version = (3, 6, 0) description = _( 'Downloads metadata and covers from JD.com - A online book seller in China' ) capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset([ 'title', 'authors', 'pubdate', 'comments', 'publisher', 'series', 'identifier:isbn', 'identifier:jd' ]) supports_gzip_transfer_encoding = True has_html_comments = True options = (Option('add_authors', 'bool', False, _('Add authors to search books:'), _('Whether to add authors to search books.')), ) @property def user_agent(self): # Pass in an index to random_user_agent() to test with a particular # user agent #return random_user_agent(allow_ie=False) return 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0' def _get_book_url(self, sku): if sku: return 'https://item.jd.com/{}.html'.format(sku) def get_book_url(self, identifiers): # {{{ sku = identifiers.get('jd', None) if sku: return 'JD', sku, self._get_book_url(sku) # }}} def get_cached_cover_url(self, identifiers): # {{{ sku = identifiers.get('jd', None) if not sku: isbn = identifiers.get('isbn', None) if isbn is not None: sku = self.cached_isbn_to_identifier(isbn) return self.cached_identifier_to_cover_url(sku) # }}} def create_query(self, log, title=None, authors=None, identifiers={}): try: from urllib.parse import urlencode except ImportError: from urllib import urlencode import time BASE_URL = 'https://search.jd.com/Search?' keywords = [] isbn = check_isbn(identifiers.get('isbn', None)) if isbn is not None: keywords.append(isbn) elif title: title_tokens = list(self.get_title_tokens(title)) if title_tokens: keywords.extend(title_tokens) if self.prefs['add_authors']: author_tokens = self.get_author_tokens(authors, only_first_author=True) if author_tokens: keywords.extend(author_tokens) if not keywords: return None word = (' '.join(keywords)).encode('utf-8') params = {'keyword': word, 'enc': 'utf-8', 'wp': word, 'book': 'y'} return BASE_URL + urlencode(params) # }}} def identify( self, log, result_queue, abort, title=None, authors=None, # {{{ identifiers={}, timeout=30): br = self.browser br.addheaders = [ ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0' ), ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' ), ('Accept-Language', 'en-US,en;q=0.8,zh-CN;q=0.5,zh;q=0.3'), ('Referer', 'https://www.jd.com/'), ('DNT', '1'), ('Connection', 'keep-alive'), ('Upgrade-Insecure-Requests', '1'), ('TE', 'Trailers') ] self.identifiers = identifiers if 'jd' in identifiers: items = [identifiers['jd']] else: query = self.create_query(log, title=title, authors=authors, identifiers=identifiers) if not query: log.error('Insufficient metadata to construct query:', query) return log('Using query URL:', query) try: raw = br.open(query, timeout=timeout).read().decode('utf-8') except Exception as e: log.exception('Failed to make identify query: %r' % query) return as_unicode(e) root = parse_html(raw) items = [] items_low_prio = [] items_tree = root.xpath('//*[@id="J_goodsList"]/ul/li') for item in items_tree: sku = item.get('data-sku') all_str = etree.tostring(item, method='text', encoding='utf-8') if all_str.find(u'自营') > 0: items.append(sku) else: items_low_prio.append(sku) items.extend(items_low_prio) if not items: log.error('Failed to get list of matching items') #log.debug('Response text:') #log.debug(raw) return if (not items and identifiers and title and authors and not 
abort.is_set()): if 'isbn' in identifiers: return identifiers.pop('jd', None) return self.identify(log, result_queue, abort, title=title, authors=authors, timeout=timeout) if not items: return workers = [] items = items[:5] for i, item in enumerate(items): workers.append( Worker(item, i, result_queue, br.clone_browser(), timeout, log, self)) if not workers: return for w in workers: w.start() # Don't send all requests at the same time time.sleep(0.1) while not abort.is_set(): a_worker_is_alive = False for w in workers: w.join(0.2) if abort.is_set(): break if w.is_alive(): a_worker_is_alive = True if not a_worker_is_alive: break # }}} def download_cover( self, log, result_queue, abort, # {{{ title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): cached_url = self.get_cached_cover_url(identifiers) # TODO if cached_url is None: log.info('No cached cover found, running identify') rq = Queue() self.identify(log, rq, abort, title=title, authors=authors, identifiers=identifiers) if abort.is_set(): return results = [] while True: try: results.append(rq.get_nowait()) except Empty: break results.sort(key=self.identify_results_keygen( title=title, authors=authors, identifiers=identifiers)) for mi in results: cached_url = self.get_cached_cover_url(mi.identifiers) if cached_url is not None: break if cached_url is None: log.info('No cover found') return if abort.is_set(): return br = self.browser log('Downloading cover from:', cached_url) try: cdata = br.open_novisit(cached_url, timeout=timeout).read() result_queue.put((self, cdata)) except: log.exception('Failed to download cover from:', cached_url)
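identify() above fans the candidate JD SKUs out to one Worker thread each, staggers the starts so requests are not all sent at once, and then polls with short joins so the abort event can cut the wait short. The Worker class itself is defined elsewhere in the plugin; the sketch below stands in plain threading.Thread workers to show the same start/poll pattern, with placeholder SKUs.

import threading
import time
from queue import Queue

def fetch_item(sku, result_queue):
    # Placeholder for the per-item metadata fetch done by the real Worker class.
    result_queue.put(sku)

def run_workers(items, abort, result_queue, max_items=5):
    workers = [threading.Thread(target=fetch_item, args=(sku, result_queue))
               for sku in items[:max_items]]
    for w in workers:
        w.start()
        time.sleep(0.1)  # don't send all requests at the same time
    while not abort.is_set():
        if not any(w.is_alive() for w in workers):
            break
        for w in workers:
            w.join(0.2)
            if abort.is_set():
                break

# e.g.:
# abort = threading.Event(); rq = Queue()
# run_workers(['12345', '67890'], abort, rq)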
class GoogleImages(Source): name = 'Google Images' version = (1, 0, 2) minimum_calibre_version = (2, 80, 0) description = _('Downloads covers from a Google Image search. Useful to find larger/alternate covers.') capabilities = frozenset(['cover']) can_get_multiple_covers = True supports_gzip_transfer_encoding = True options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'), _('The maximum number of covers to process from the Google search result')), Option('size', 'choices', 'svga', _('Cover size'), _('Search for covers larger than the specified size'), choices=OrderedDict(( ('any', _('Any size'),), ('l', _('Large'),), ('qsvga', _('Larger than %s')%'400x300',), ('vga', _('Larger than %s')%'640x480',), ('svga', _('Larger than %s')%'600x800',), ('xga', _('Larger than %s')%'1024x768',), ('2mp', _('Larger than %s')%'2 MP',), ('4mp', _('Larger than %s')%'4 MP',), ))), ) def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): if not title: return timeout = max(60, timeout) # Needs at least a minute title = ' '.join(self.get_title_tokens(title)) author = ' '.join(self.get_author_tokens(authors)) urls = self.get_image_urls(title, author, log, abort, timeout) self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log) @property def user_agent(self): return random_user_agent(allow_ie=False) def get_image_urls(self, title, author, log, abort, timeout): from calibre.utils.cleantext import clean_ascii_chars try: from urllib.parse import urlencode except ImportError: from urllib import urlencode from collections import OrderedDict ans = OrderedDict() br = self.browser q = urlencode({'as_q': ('%s %s'%(title, author)).encode('utf-8')}) if isinstance(q, bytes): q = q.decode('utf-8') sz = self.prefs['size'] if sz == 'any': sz = '' elif sz == 'l': sz = 'isz:l,' else: sz = 'isz:lt,islt:%s,' % sz # See https://www.google.com/advanced_image_search to understand this # URL scheme url = 'https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg'.format(q, sz) log('Search URL: ' + url) raw = clean_ascii_chars(br.open(url).read().decode('utf-8')) root = parse_html(raw) results = root.xpath('//div/@data-tbnid') # could also use data-id # from calibre.utils.ipython import ipython # ipython({'root': root, 'raw': raw, 'url': url, 'results': results}) for tbnid in results: try: imgurl = imgurl_from_id(raw, tbnid) except Exception: continue if imgurl: ans[imgurl] = True return list(ans)
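get_image_urls above drives a Google advanced image search: the title/author query goes into as_q and the minimum-size restriction is folded into the tbs parameter. A standalone sketch of that URL construction follows; the title and author in the usage note are placeholders.

from urllib.parse import urlencode

def google_images_url(title, author, size='svga'):
    # Same URL scheme as GoogleImages.get_image_urls above.
    q = urlencode({'as_q': '%s %s' % (title, author)})
    if size == 'any':
        sz = ''
    elif size == 'l':
        sz = 'isz:l,'
    else:
        sz = 'isz:lt,islt:%s,' % size
    return ('https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq='
            '&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg').format(q, sz)

# e.g. google_images_url('Dune', 'Frank Herbert')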
class ISBNDB(Source): name = 'ISBNDB' version = (1, 0, 0) minimum_calibre_version = (2, 80, 0) description = _('Downloads metadata from isbndb.com') capabilities = frozenset(['identify']) touched_fields = frozenset( ['title', 'authors', 'identifier:isbn', 'comments', 'publisher']) supports_gzip_transfer_encoding = True # Shortcut, since we have no cached cover URLS cached_cover_url_is_reliable = False options = (Option( 'isbndb_key', 'string', None, _('IsbnDB key:'), _('To use isbndb.com you have to sign up for a free account ' 'at isbndb.com and get an access key.')), ) config_help_message = ('<p>' + _( 'To use metadata from isbndb.com you must sign' ' up for a free account and get an isbndb key and enter it below.' ' Instructions to get the key are ' '<a href="%s">here</a>.')) % 'https://isbndb.com/api/v1/docs/keys' def __init__(self, *args, **kwargs): Source.__init__(self, *args, **kwargs) prefs = self.prefs prefs.defaults['key_migrated'] = False prefs.defaults['isbndb_key'] = None if not prefs['key_migrated']: prefs['key_migrated'] = True try: from calibre.customize.ui import config key = config['plugin_customization']['IsbnDB'] prefs['isbndb_key'] = key except: pass @property def isbndb_key(self): return self.prefs['isbndb_key'] def is_configured(self): return self.isbndb_key is not None def create_query(self, title=None, authors=None, identifiers={}): # {{{ from urllib import quote base_url = BASE_URL % self.isbndb_key isbn = check_isbn(identifiers.get('isbn', None)) q = '' if isbn is not None: q = 'index1=isbn&value1=' + isbn elif title or authors: tokens = [] title_tokens = list(self.get_title_tokens(title)) tokens += title_tokens author_tokens = self.get_author_tokens(authors, only_first_author=True) tokens += author_tokens tokens = [ quote(t.encode('utf-8') if isinstance(t, unicode) else t) for t in tokens ] q = '+'.join(tokens) q = 'index1=combined&value1=' + q if not q: return None if isinstance(q, unicode): q = q.encode('utf-8') return base_url + q # }}} def identify( self, log, result_queue, abort, title=None, authors=None, # {{{ identifiers={}, timeout=30): if not self.is_configured(): return query = self.create_query(title=title, authors=authors, identifiers=identifiers) if not query: err = 'Insufficient metadata to construct query' log.error(err) return err results = [] try: results = self.make_query(query, abort, title=title, authors=authors, identifiers=identifiers, timeout=timeout) except: err = 'Failed to make query to ISBNDb, aborting.' 
log.exception(err) return err if not results and identifiers.get('isbn', False) and title and authors and \ not abort.is_set(): return self.identify(log, result_queue, abort, title=title, authors=authors, timeout=timeout) for result in results: self.clean_downloaded_metadata(result) result_queue.put(result) def parse_feed(self, feed, seen, orig_title, orig_authors, identifiers): from lxml import etree def tostring(x): if x is None: return '' return etree.tostring(x, method='text', encoding=unicode).strip() orig_isbn = identifiers.get('isbn', None) title_tokens = list(self.get_title_tokens(orig_title)) author_tokens = list(self.get_author_tokens(orig_authors)) results = [] def ismatch(title, authors): authors = lower(' '.join(authors)) title = lower(title) match = not title_tokens for t in title_tokens: if lower(t) in title: match = True break amatch = not author_tokens for a in author_tokens: if lower(a) in authors: amatch = True break if not author_tokens: amatch = True return match and amatch bl = feed.find('BookList') if bl is None: err = tostring(feed.find('errormessage')) raise ValueError('ISBNDb query failed:' + err) total_results = int(bl.get('total_results')) shown_results = int(bl.get('shown_results')) for bd in bl.xpath('.//BookData'): isbn = check_isbn(bd.get('isbn', None)) isbn13 = check_isbn(bd.get('isbn13', None)) if not isbn and not isbn13: continue if orig_isbn and orig_isbn not in {isbn, isbn13}: continue title = tostring(bd.find('Title')) if not title: continue authors = [] for au in bd.xpath('.//Authors/Person'): au = tostring(au) if au: if ',' in au: ln, _, fn = au.partition(',') au = fn.strip() + ' ' + ln.strip() authors.append(au) if not authors: continue comments = tostring(bd.find('Summary')) id_ = (title, tuple(authors)) if id_ in seen: continue seen.add(id_) if not ismatch(title, authors): continue publisher = tostring(bd.find('PublisherText')) if not publisher: publisher = None if publisher and 'audio' in publisher.lower(): continue mi = Metadata(title, authors) mi.isbn = isbn mi.publisher = publisher mi.comments = comments results.append(mi) return total_results, shown_results, results def make_query(self, q, abort, title=None, authors=None, identifiers={}, max_pages=10, timeout=30): from lxml import etree from calibre.ebooks.chardet import xml_to_unicode from calibre.utils.cleantext import clean_ascii_chars page_num = 1 parser = etree.XMLParser(recover=True, no_network=True) br = self.browser seen = set() candidates = [] total_found = 0 while page_num <= max_pages and not abort.is_set(): url = q.replace('&page_number=1&', '&page_number=%d&' % page_num) page_num += 1 raw = br.open_novisit(url, timeout=timeout).read() feed = etree.fromstring(xml_to_unicode( clean_ascii_chars(raw), strip_encoding_pats=True)[0], parser=parser) total, found, results = self.parse_feed(feed, seen, title, authors, identifiers) total_found += found candidates += results if total_found >= total or len(candidates) > 9: break return candidates
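make_query above pages through ISBNdb results by rewriting the '&page_number=1&' fragment of the query URL for each successive page. A small sketch of that rewrite follows; the example URL is a placeholder.

def iter_page_urls(base_query, max_pages=10):
    # Mirrors the paging in make_query above: the page-1 URL is rewritten with
    # each successive page number. base_query is assumed to embed
    # '&page_number=1&', as the ISBNdb query URLs built by this plugin do.
    for page_num in range(1, max_pages + 1):
        yield base_query.replace('&page_number=1&', '&page_number=%d&' % page_num)

# e.g. list(iter_page_urls('https://example.invalid/api?q=x&page_number=1&y=2', 3))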
class Ozon(Source): name = 'OZON.ru' description = _('Downloads metadata and covers from OZON.ru') capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset([ 'title', 'authors', 'identifier:isbn', 'identifier:ozon', 'publisher', 'pubdate', 'comments', 'series', 'rating', 'languages' ]) # Test purpose only, test function does not like when sometimes some filed are empty # touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon', # 'publisher', 'pubdate', 'comments']) supports_gzip_transfer_encoding = True has_html_comments = True ozon_url = 'http://www.ozon.ru' # match any ISBN10/13. From "Regular Expressions Cookbook" isbnPattern = r'(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|'\ '[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?'\ '(?:[0-9]+[- ]?){2}[0-9X]' isbnRegex = re.compile(isbnPattern) optkey_strictmatch = 'strict_result_match' options = (Option( optkey_strictmatch, 'bool', False, _('Filter out less relevant hits from the search results'), _('Improve search result by removing less relevant hits. It can be useful to refine the search when there are many matches' )), ) def get_book_url(self, identifiers): # {{{ import urllib2 ozon_id = identifiers.get('ozon', None) res = None if ozon_id: url = '{}/context/detail/id/{}?partner={}'.format( self.ozon_url, urllib2.quote(ozon_id), _get_affiliateId()) res = ('ozon', ozon_id, url) return res # }}} def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ from urllib import quote_plus # div_book -> search only books, ebooks and audio books search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText=' # for ozon.ru search we have to format ISBN with '-' isbn = _format_isbn(log, identifiers.get('isbn', None)) if isbn and not '-' in isbn: log.error( "%s requires formatted ISBN for search. %s cannot be formated - removed. 
(only Russian ISBN format is supported now)" % (self.name, isbn)) isbn = None ozonid = identifiers.get('ozon', None) qItems = set([ozonid, isbn]) unk = unicode(_('Unknown')).upper() if title and title != unk: qItems.add(title) if authors and authors != [unk]: qItems |= frozenset(authors) qItems.discard(None) qItems.discard('') qItems = map(_quoteString, qItems) searchText = u' '.join(qItems).strip() if isinstance(searchText, unicode): searchText = searchText.encode('utf-8') if not searchText: return None search_url += quote_plus(searchText) log.debug(u'search url: %r' % search_url) return search_url # }}} def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=60): # {{{ from lxml import etree from calibre.ebooks.chardet import xml_to_unicode if not self.is_configured(): return query = self.create_query(log, title=title, authors=authors, identifiers=identifiers) if not query: err = u'Insufficient metadata to construct query' log.error(err) return err try: raw = self.browser.open_novisit(query).read() except Exception as e: log.exception(u'Failed to make identify query: %r' % query) return as_unicode(e) try: parser = etree.XMLParser(recover=True, no_network=True) feed = etree.fromstring(xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0], parser=parser) entries = feed.xpath( '//*[local-name()="SearchItems" or local-name()="ItemDetail"]') if entries: metadata = self.get_metadata(log, entries, title, authors, identifiers) self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout) except Exception as e: log.exception('Failed to parse identify results') return as_unicode(e) # }}} def get_metadata(self, log, entries, title, authors, identifiers): # {{{ # some book titles have extra characters like this # TODO: make a twick #reRemoveFromTitle = None reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]') title = unicode(title).upper() if title else '' if reRemoveFromTitle: title = reRemoveFromTitle.sub('', title) authors = map(_normalizeAuthorNameWithInitials, map(unicode.upper, map(unicode, authors))) if authors else None ozon_id = identifiers.get('ozon', None) #log.debug(u'ozonid: ', ozon_id) unk = unicode(_('Unknown')).upper() if title == unk: title = None if authors == [unk] or authors == []: authors = None def in_authors(authors, miauthors): for author in authors: for miauthor in miauthors: #log.debug(u'=> %s <> %s'%(author, miauthor)) if author in miauthor: return True return None def calc_source_relevance(mi): # {{{ relevance = 0 if title: mititle = unicode(mi.title).upper() if mi.title else '' if reRemoveFromTitle: mititle = reRemoveFromTitle.sub('', mititle) if title in mititle: relevance += 3 elif mititle: # log.debug(u'!!%s!'%mititle) relevance -= 3 else: relevance += 1 if authors: miauthors = map(unicode.upper, map( unicode, mi.authors)) if mi.authors else [] if (in_authors(authors, miauthors)): relevance += 3 elif u''.join(miauthors): # log.debug(u'!%s!'%u'|'.join(miauthors)) relevance -= 3 else: relevance += 1 if ozon_id: mozon_id = mi.identifiers['ozon'] if ozon_id == mozon_id: relevance += 100 if relevance < 0: relevance = 0 return relevance # }}} strict_match = self.prefs[self.optkey_strictmatch] metadata = [] for entry in entries: mi = self.to_metadata(log, entry) relevance = calc_source_relevance(mi) # TODO findout which is really used mi.source_relevance = relevance mi.relevance_in_source = relevance if not strict_match or relevance > 0: metadata.append(mi) #log.debug(u'added metadata %s %s.'%(mi.title, 
mi.authors)) else: log.debug( u'skipped metadata title: %s, authors: %s. (does not match the query - relevance score: %s)' % (mi.title, u' '.join(mi.authors), relevance)) return metadata # }}} def get_all_details(self, log, metadata, abort, result_queue, identifiers, timeout): # {{{ req_isbn = identifiers.get('isbn', None) for mi in metadata: if abort.is_set(): break try: ozon_id = mi.identifiers['ozon'] try: self.get_book_details(log, mi, timeout) except: log.exception(u'Failed to get details for metadata: %s' % mi.title) all_isbns = getattr(mi, 'all_isbns', []) if req_isbn and all_isbns and check_isbn( req_isbn) not in all_isbns: log.debug(u'skipped, no requested ISBN %s found' % req_isbn) continue for isbn in all_isbns: self.cache_isbn_to_identifier(isbn, ozon_id) if mi.ozon_cover_url: self.cache_identifier_to_cover_url(ozon_id, mi.ozon_cover_url) self.clean_downloaded_metadata(mi) result_queue.put(mi) except: log.exception(u'Failed to get details for metadata: %s' % mi.title) # }}} def to_metadata(self, log, entry): # {{{ xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())' title = entry.xpath(xp_template.format('Name')) author = entry.xpath(xp_template.format('Author')) norm_authors = map(_normalizeAuthorNameWithInitials, map(unicode.strip, unicode(author).split(u','))) mi = Metadata(title, norm_authors) ozon_id = entry.xpath(xp_template.format('ID')) mi.identifiers = {'ozon': ozon_id} mi.comments = entry.xpath(xp_template.format('Annotation')) mi.ozon_cover_url = None cover = entry.xpath(xp_template.format('Picture')) if cover: mi.ozon_cover_url = _translateToBigCoverUrl(cover) pub_year = entry.xpath(xp_template.format('Year')) if pub_year: mi.pubdate = toPubdate(log, pub_year) #log.debug('pubdate %s'%mi.pubdate) rating = entry.xpath(xp_template.format('ClientRatingValue')) if rating: try: #'rating', A floating point number between 0 and 10 # OZON raion N of 5, calibre of 10, but there is a bug? 
in identify mi.rating = float(rating) except: pass rating return mi # }}} def get_cached_cover_url(self, identifiers): # {{{ url = None ozon_id = identifiers.get('ozon', None) if ozon_id is None: isbn = identifiers.get('isbn', None) if isbn is not None: ozon_id = self.cached_isbn_to_identifier(isbn) if ozon_id is not None: url = self.cached_identifier_to_cover_url(ozon_id) return url # }}} def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): # {{{ cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: log.debug('No cached cover found, running identify') rq = Queue() self.identify(log, rq, abort, title=title, authors=authors, identifiers=identifiers) if abort.is_set(): return results = [] while True: try: results.append(rq.get_nowait()) except Empty: break results.sort(key=self.identify_results_keygen( title=title, authors=authors, identifiers=identifiers)) for mi in results: cached_url = self.get_cached_cover_url(mi.identifiers) if cached_url is not None: break if cached_url is None: log.info('No cover found') return if abort.is_set(): return log.debug('Downloading cover from:', cached_url) try: cdata = self.browser.open_novisit(cached_url, timeout=timeout).read() if cdata: result_queue.put((self, cdata)) except Exception as e: log.exception(u'Failed to download cover from: %s' % cached_url) return as_unicode(e) # }}} def get_book_details(self, log, metadata, timeout): # {{{ from lxml import html, etree from calibre.ebooks.chardet import xml_to_unicode url = self.get_book_url(metadata.get_identifiers())[2] raw = self.browser.open_novisit(url, timeout=timeout).read() doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0]) xpt_tmpl_base = u'//text()[starts-with(translate(normalize-space(.), " \t", ""), "%s")]' xpt_tmpl_a = u'normalize-space(' + xpt_tmpl_base + u'/following-sibling::a[1]/@title)' # series Серия/Серии series = doc.xpath(xpt_tmpl_a % u'Сери') if series: metadata.series = series #log.debug(u'Seria: ', metadata.series) xpt_isbn = u'normalize-space(' + xpt_tmpl_base + u')' isbn_str = doc.xpath(xpt_isbn % u'ISBN') if isbn_str: #log.debug(u'ISBNS: ', self.isbnRegex.findall(isbn_str)) all_isbns = [ check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str) if _verifyISBNIntegrity(log, isbn) ] if all_isbns: metadata.all_isbns = all_isbns metadata.isbn = all_isbns[0] #log.debug(u'ISBN: ', metadata.isbn) publishers = doc.xpath(xpt_tmpl_a % u'Издатель') if publishers: metadata.publisher = publishers #log.debug(u'Publisher: ', metadata.publisher) xpt_lang = u'substring-after(normalize-space(//text()[contains(normalize-space(.), "%s")]), ":")' displ_lang = None langs = doc.xpath(xpt_lang % u'Язык') if langs: lng_splt = langs.split(u',') if lng_splt: displ_lang = lng_splt[0].strip() metadata.language = _translageLanguageToCode(displ_lang) #log.debug(u'Language: ', metadata.language) # can be set before from xml search responce if not metadata.pubdate: xpt = u'substring-after(' + xpt_isbn + u',";")' yearIn = doc.xpath(xpt % u'ISBN') if yearIn: matcher = re.search(r'\d{4}', yearIn) if matcher: metadata.pubdate = toPubdate(log, matcher.group(0)) #log.debug(u'Pubdate: ', metadata.pubdate) # overwrite comments from HTML if any xpt = u'//*[@id="detail_description"]//*[contains(text(), "От производителя")]/../node()[not(self::comment())][not(self::br)][preceding::*[contains(text(), "От производителя")]]' # noqa from lxml.etree import ElementBase comment_elem = doc.xpath(xpt) if comment_elem: 
comments = u'' for node in comment_elem: if isinstance(node, ElementBase): comments += unicode(etree.tostring(node, encoding=unicode)) elif isinstance(node, basestring) and node.strip(): comments += unicode(node) + u'\n' if comments and (not metadata.comments or len(comments) > len(metadata.comments)): metadata.comments = comments else: log.debug( 'HTML book description skipped in favor of search service xml response' ) else: log.debug('No book description found in HTML')
class Douban(Source): name = "Douban Books Reload" author = "Li Fanxi, xcffl, jnozsc" version = (4, 0, 0) minimum_calibre_version = (2, 80, 0) description = _( "Downloads metadata and covers from Douban.com. " "Useful only for Chinese language books." ) capabilities = frozenset(["identify", "cover"]) touched_fields = frozenset( [ "title", "authors", "tags", "pubdate", "comments", "publisher", "identifier:isbn", "rating", "identifier:douban", ] ) # language currently disabled supports_gzip_transfer_encoding = True cached_cover_url_is_reliable = True DOUBAN_API_URL = "https://api.douban.com/v2/book/search" DOUBAN_BOOK_URL = "https://book.douban.com/subject/%s/" options = ( Option( "include_subtitle_in_title", "bool", True, _("Include subtitle in book title:"), _("Whether to append subtitle in the book title."), ), Option( "apikey", "string", "", _("douban api v2 apikey"), _("douban api v2 apikey") ), ) def to_metadata(self, browser, log, entry_, timeout): # {{{ from calibre.utils.date import parse_date, utcnow douban_id = entry_.get("id") title = entry_.get("title") description = entry_.get("summary") # subtitle = entry_.get('subtitle') # TODO: std metada doesn't have this field publisher = entry_.get("publisher") isbn = entry_.get("isbn13") # ISBN11 is obsolute, use ISBN13 pubdate = entry_.get("pubdate") authors = entry_.get("author") book_tags = entry_.get("tags") rating = entry_.get("rating") cover_url = entry_.get("images", {}).get("large") series = entry_.get("series") if not authors: authors = [_("Unknown")] if not douban_id or not title: # Silently discard this entry return None mi = Metadata(title, authors) mi.identifiers = {"douban": douban_id} mi.publisher = publisher mi.comments = description # mi.subtitle = subtitle # ISBN isbns = [] if isinstance(isbn, (type(""), bytes)): if check_isbn(isbn): isbns.append(isbn) else: for x in isbn: if check_isbn(x): isbns.append(x) if isbns: mi.isbn = sorted(isbns, key=len)[-1] mi.all_isbns = isbns # Tags mi.tags = [tag["name"] for tag in book_tags] # pubdate if pubdate: try: default = utcnow().replace(day=15) mi.pubdate = parse_date(pubdate, assume_utc=True, default=default) except: log.error("Failed to parse pubdate %r" % pubdate) # Ratings if rating: try: mi.rating = float(rating["average"]) / 2.0 except: log.exception("Failed to parse rating") mi.rating = 0 # Cover mi.has_douban_cover = None u = cover_url if u: # If URL contains "book-default", the book doesn't have a cover if u.find("book-default") == -1: mi.has_douban_cover = u # Series if series: mi.series = series["title"] return mi # }}} def get_book_url(self, identifiers): # {{{ db = identifiers.get("douban", None) if db is not None: return ("douban", db, self.DOUBAN_BOOK_URL % db) # }}} def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ try: from urllib.parse import urlencode except ImportError: from urllib import urlencode SEARCH_URL = "https://api.douban.com/v2/book/search?count=10&" ISBN_URL = "https://api.douban.com/v2/book/isbn/" SUBJECT_URL = "https://api.douban.com/v2/book/" q = "" t = None isbn = check_isbn(identifiers.get("isbn", None)) subject = identifiers.get("douban", None) if isbn is not None: q = isbn t = "isbn" elif subject is not None: q = subject t = "subject" elif title or authors: def build_term(prefix, parts): return " ".join(x for x in parts) title_tokens = list(self.get_title_tokens(title)) if title_tokens: q += build_term("title", title_tokens) author_tokens = list( self.get_author_tokens(authors, only_first_author=True) ) if 
author_tokens: q += (" " if q != "" else "") + build_term("author", author_tokens) t = "search" q = q.strip() # if isinstance(q, type("")): # q = q.encode("utf-8") q = str(q) if not q: return None url = None if t == "isbn": url = ISBN_URL + q elif t == "subject": url = SUBJECT_URL + q else: url = SEARCH_URL + urlencode( { "q": q, } ) if self.prefs.get("apikey"): if t == "isbn" or t == "subject": url = url + "?apikey=" + self.prefs["apikey"] else: url = url + "&apikey=" + self.prefs["apikey"] return url # }}} def download_cover( self, log, result_queue, abort, # {{{ title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False, ): cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: log.info("No cached cover found, running identify") rq = Queue() self.identify( log, rq, abort, title=title, authors=authors, identifiers=identifiers ) if abort.is_set(): return results = [] while True: try: results.append(rq.get_nowait()) except Empty: break results.sort( key=self.identify_results_keygen( title=title, authors=authors, identifiers=identifiers ) ) for mi in results: cached_url = self.get_cached_cover_url(mi.identifiers) if cached_url is not None: break if cached_url is None: log.info("No cover found") return if abort.is_set(): return br = self.browser log("Downloading cover from:", cached_url) try: cdata = br.open_novisit(cached_url, timeout=timeout).read() if cdata: result_queue.put((self, cdata)) except: log.exception("Failed to download cover from:", cached_url) # }}} def get_cached_cover_url(self, identifiers): # {{{ url = None db = identifiers.get("douban", None) if db is None: isbn = identifiers.get("isbn", None) if isbn is not None: db = self.cached_isbn_to_identifier(isbn) if db is not None: url = self.cached_identifier_to_cover_url(db) return url # }}} def get_all_details(self, br, log, entries, abort, result_queue, timeout): # {{{ for relevance, i in enumerate(entries): try: ans = self.to_metadata(br, log, i, timeout) if isinstance(ans, Metadata): ans.source_relevance = relevance db = ans.identifiers["douban"] for isbn in getattr(ans, "all_isbns", []): self.cache_isbn_to_identifier(isbn, db) if ans.has_douban_cover: self.cache_identifier_to_cover_url(db, ans.has_douban_cover) self.clean_downloaded_metadata(ans) result_queue.put(ans) except: log.exception("Failed to get metadata for identify entry:", i) if abort.is_set(): break # }}} def identify( self, log, result_queue, abort, title=None, authors=None, # {{{ identifiers={}, timeout=30, ): # check apikey if not self.prefs.get("apikey"): return import json query = self.create_query( log, title=title, authors=authors, identifiers=identifiers ) if not query: log.error("Insufficient metadata to construct query") return br = self.browser try: raw = br.open_novisit(query, timeout=timeout).read() except Exception as e: log.exception("Failed to make identify query: %r" % query) return as_unicode(e) try: j = json.loads(raw) except Exception as e: log.exception("Failed to parse identify results") return as_unicode(e) if "books" in j: entries = j["books"] else: entries = [] entries.append(j) if not entries and identifiers and title and authors and not abort.is_set(): return self.identify( log, result_queue, abort, title=title, authors=authors, timeout=timeout ) # There is no point running these queries in threads as douban # throttles requests returning 403 Forbidden errors self.get_all_details(br, log, entries, abort, result_queue, timeout) return None
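# A short, standalone illustration (not used by the plugin) of two conversions performed in
# Douban.to_metadata: among the ISBNs returned by the API the longest valid one wins (so an
# ISBN-13 is preferred over an ISBN-10), and Douban's 0-10 average rating is halved onto
# calibre's 0-5 scale. check_isbn is calibre's usual validator.
from calibre.ebooks.metadata import check_isbn

def preferred_isbn(candidates):
    valid = [x for x in candidates if check_isbn(x)]
    return sorted(valid, key=len)[-1] if valid else None

def douban_rating_to_calibre(rating):
    # rating is the API's dict, e.g. {'average': '8.4', ...}
    return float(rating['average']) / 2.0

# preferred_isbn(['702000220X', '9787020002207'])  -> '9787020002207'
# douban_rating_to_calibre({'average': '8.4'})     -> 4.2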
class GoodreadsAPI(Source): """ Goodreads API """ name = 'GoodreadsAPI' description = 'GoodreadsAPI' author = 'botmtl' version = (0, 0, 2) minimum_calibre_version = (0, 8, 1) capabilities = frozenset(['identify']) has_html_comments = True supports_gzip_transfer_encoding = True BASE_URL = 'https://www.goodreads.com' ISBN_TO_BOOKID = 'https://www.goodreads.com/book/isbn_to_id/{0}?key={1}' BOOK_SHOW = 'https://www.goodreads.com/book/show/{0}.xml?key={1}' BOOK_SHOW_ISBN = 'https://www.goodreads.com/book/isbn/{0}.xml?key={1}' # name, type_, default, label, desc, choices=None options = [ Option(name='GOODREADS_API_KEY', type_='string', default='', label='GOODREADS_API_KEY', desc='GOODREADS_API_KEY'), Option( name='SHELF_COUNT_THRESHOLD', type_='number', default=2, label='SHELF_COUNT_THRESHOLD:', desc= 'How many shelves does this book have to be in to be considered a tag.' ), Option(name='NEVER_REPLACE_AMAZONID', type_='bool', default=True, label='NEVER_REPLACE_AMAZONID:', desc='NEVER_REPLACE_AMAZONID'), Option(name='NEVER_REPLACE_ISBN', type_='bool', default=True, label='NEVER_REPLACE_ISBN:', desc='NEVER_REPLACE_ISBN'), Option(name='CHECK_AMAZONID_VALIDITY', type_='bool', default=True, label='CHECK_AMAZONID_VALIDITY:', desc='Not Implemented.'), Option( name='ADD_THESE_TAGS', type_='string', default='GoodreadsAPI', label='Additioal tags:', desc= 'A comma separated list of tags to add on a sucessful metadata download.' ), Option( u'DISABLE_TITLE_AUTHOR_SEARCH', u'bool', False, u'Disable title/author search:', u'Only books with identifiers will have a chance for to find a match with the metadata provider.' ) ] def __init__(self, *args, **kwargs): """ Args: args: kwargs: """ self.touched_fields = frozenset([ 'title', 'authors', 'identifier:goodreads', 'identifier:amazon', 'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate', 'tags', 'series' ]) Source.__init__(self, *args, **kwargs) def is_configured(self): # type: () -> bool """ :return: False if your plugin needs to be configured before it can be used. For example, it might need a username/password/API key. :rtype: bool """ if self.prefs['GOODREADS_API_KEY']: return True return False def get_cached_cover_url(self, identifiers): """ :param identifiers: list(unicode) or list(str) :return: Text: url """ url = None if identifiers.get('goodreads'): url = self.cached_identifier_to_cover_url( identifiers.get('goodreads')) return url def clean_downloaded_metadata(self, mi): """ Overridden from the calibre default so that we can stop this plugin messing with the tag casing coming from Goodreads """ series_in_title = r'\s*{0}\s*#?{1}\s*'.format(mi.series, mi.series_index) if mi.title: mi.title = re.sub(series_in_title + r'[:-]', r'', mi.title, flags=re.IGNORECASE).strip() mi.title = re.sub(r'(?:[^:-]+)[:-]' + series_in_title, r'', mi.title, flags=re.IGNORECASE).strip() mi.title = re.sub(r'\(.*?\)', r'', mi.title, flags=re.IGNORECASE).strip() mi.title = re.sub(r'\[.*?\]', r'', mi.title, flags=re.IGNORECASE).strip() mi.title = fixcase(mi.title) mi.title = mi.title.strip() if mi.authors: mi.authors = fixauthors(mi.authors) try: plugin_prefs = JSONConfig('plugins/Quality Check') from calibre_plugins.quality_check.config import STORE_OPTIONS, KEY_AUTHOR_INITIALS_MODE, AUTHOR_INITIALS_MODES initials_mode = plugin_prefs[STORE_OPTIONS].get( KEY_AUTHOR_INITIALS_MODE, u'A. 
B.') from calibre_plugins.quality_check.helpers import get_formatted_author_initials mi.authors = [ get_formatted_author_initials(initials_mode, author) for author in mi.authors ] except: pass def _autocomplete_api(self, search_terms, timeout=10): # type: (Text, int) -> dict or None """ :param timeout: int: urlopen will raise an exception :param search_terms: unicode: search term(s) :return: dict: a dictionnary representing the first book found by the api. """ from urllib2 import urlopen import json search_terms = search_terms.strip() if search_terms is None: return None search_terms = search_terms.replace(' and ', ' ').replace(' or ', ' ').replace( ' & ', ' ').replace('-', ' ') search_terms = search_terms.replace(' ', ' ') search_terms = search_terms.strip().replace(' ', '+') autocomplete_api_url = "https://www.goodreads.com/book/auto_complete?format=json&q=" self.log.info('autocomplete url:', autocomplete_api_url, search_terms) response = urlopen(autocomplete_api_url + search_terms, timeout=timeout).read() if response is not None: result = json.loads(response) if len(result) >= 1: return result[0]['bookId'] return None def identify(self, log, result_queue, abort, title=None, authors=None, identifiers=None, timeout=30): """ :param log: :param result_queue: :param abort: :param title: :param authors: :param identifiers: :param timeout: :return: """ if not identifiers: identifiers = {} goodreads_id = None # noinspection PyAttributeOutsideInit self.log = log if identifiers.get('amazon'): try: self.log.info('ISBN_TO_BOOKID', identifiers.get('amazon')) request = GoodreadsAPI.ISBN_TO_BOOKID.format( identifiers.get('amazon'), self.prefs['GOODREADS_API_KEY']) goodreads_id = urlopen(request).read() except: pass if not goodreads_id and identifiers.get('goodreads'): goodreads_id = identifiers.get('goodreads') if not goodreads_id and identifiers.get('isbn'): try: self.log.info('ISBN_TO_BOOKID', identifiers.get('isbn')) request = GoodreadsAPI.ISBN_TO_BOOKID.format( identifiers.get('isbn'), self.prefs['GOODREADS_API_KEY']) goodreads_id = urlopen(request).read() except: pass if not goodreads_id and title and not self.prefs[ 'DISABLE_TITLE_AUTHOR_SEARCH']: self.log.info( 'AUTOCOMPLETEAPI:', ' '.join(self.get_title_tokens(title)) + ' ' + ' '.join(self.get_author_tokens(authors))) goodreads_id = self._autocomplete_api( ' '.join(self.get_title_tokens(title)) + ' ' + ' '.join(self.get_author_tokens(authors)), 10) if goodreads_id: try: self.log.info('BOOK_SHOW ', goodreads_id) request_book = GoodreadsAPI.BOOK_SHOW.format( goodreads_id, self.prefs['GOODREADS_API_KEY']) response = urlopen(request_book).read() response = re.sub(re.compile(r'>\s+<', re.MULTILINE), '><', response) response = re.sub(re.compile(r'\r\n', re.MULTILINE), r'', response) mi = self._GoodreadsBook_to_Metadata( _GoodreadsBook(str(response), self.prefs['SHELF_COUNT_THRESHOLD'])) except Exception as e: self.log.error(e.message) self.log.error(traceback.print_stack()) traceback.print_exc() return self.clean_downloaded_metadata(mi) result_queue.put(mi) return None def _GoodreadsBook_to_Metadata(self, book): # type: (_GoodreadsBook) -> Metadata """ :param book: _GoodreadsBook: book :return: Metadata: Metadata """ mi = Metadata(book.title, book.authors) mi.source_relevance = 0 mi.set_identifier('goodreads', book.id) if self.prefs['NEVER_REPLACE_ISBN'] and mi.get_identifiers().get( 'isbn'): mi.set_identifier('isbn', '') if book.asin and not self.prefs['NEVER_REPLACE_AMAZONID']: mi.set_identifier('amazon', book.asin) if book.isbn and not 
self.prefs['NEVER_REPLACE_ISBN']: try: if len(book.isbn) == 10: mi.isbn = check_isbn13(_ISBNConvert.convert(book.isbn)) else: mi.isbn = check_isbn13(book.isbn) except: self.log.error("ISBN CONVERSION ERROR:", book.isbn) self.log.exception() if book.image_url: self.log.info('cache_identifier_to_cover_url:', book.asin, ':', book.image_url) self.cache_identifier_to_cover_url(book.id, book.image_url) if book.publisher: self.log.info('book.publisher is:', book.publisher) mi.publisher = book.publisher if book.pubdate: self.log.info('book.pubdate is:', book.pubdate.strftime('%Y-%m-%d')) mi.pubdate = book.pubdate if book.comments: self.log.info('book.editorial_review is:', book.comments) mi.comments = book.comments tags = self.prefs['ADD_THESE_TAGS'].split(',') tags.extend(book.tags) # tag_mappings = JSONConfig('plugins/GenreMappings')['genreMappings'] # mi.tags = list(set(sorted(filter(lambda x: tag_mappings.get(x, x), tags)))) if book.series: mi.series = book.series self.log.info(u'series:', book.series) if book.series_index: mi.series_index = book.series_index self.log.info(u'series_index:', "{0:.2f}".format(book.series_index)) else: mi.series_index = 0 if book.average_rating: mi.rating = book.average_rating self.clean_downloaded_metadata(mi) return mi def cli_main(self, args): """ :type args: list :param args: args """ pass # noinspection PyDefaultArgument def download_cover(self, log, result_queue, abort, title=None, authors=[], identifiers={}, timeout=30, get_best_cover=False): # type: (ThreadSafeLog, Queue, Event, Text, list(), dict(), int, bool) -> Text """ Download a cover and put it into result_queue. The parameters all have the same meaning as for :meth:`identify`. Put (self, cover_data) into result_queue. This method should use cached cover URLs for efficiency whenever possible. When cached data is not present, most plugins simply call identify and use its results. If the parameter get_best_cover is True and this plugin can get multiple covers, it should only get the best one. :type result_queue: Queue :param log: ThreadSafeLog: log :param result_queue: Queue: results :param abort: Event: if is_set,abort :param title: Optional[unicode]: title :param authors: Optional[List]: authors :param timeout: int: timeout :param get_best_cover: bool:cover :return: :type identifiers: Optional[Dict]: identifiers """ # noinspection PyAttributeOutsideInit self.log = log cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: self.log.info(u'No cached cover found, running identify') try: rq = Queue() self.identify(self.log, rq, abort, title, authors, identifiers) cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: return u'Download cover failed. Could not identify.' except Exception as e: return e.message if abort.is_set(): return "abort" br = self.browser self.log.info(u'Downloading cover from:', cached_url) try: cdata = br.open_novisit(cached_url, timeout=timeout).read() result_queue.put((self, cdata)) except: self.log.error(u'Failed to download cover from:', cached_url) return u'Failed to download cover from:%s' % cached_url # }}}
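# A condensed, standalone variant (illustrative only; re.escape added here for safety) of
# the title clean-up done in GoodreadsAPI.clean_downloaded_metadata: series/number
# prefixes plus parenthesised and bracketed suffixes are stripped before re-casing.
import re

def strip_series_from_title(title, series, series_index):
    series_in_title = r'\s*%s\s*#?%s\s*' % (re.escape(series), series_index)
    title = re.sub(series_in_title + r'[:-]', '', title, flags=re.IGNORECASE)
    title = re.sub(r'(?:[^:-]+)[:-]' + series_in_title, '', title, flags=re.IGNORECASE)
    title = re.sub(r'\(.*?\)', '', title)
    title = re.sub(r'\[.*?\]', '', title)
    return title.strip()

# strip_series_from_title('Dune #1: Dune (40th Anniversary Edition)', 'Dune', 1) -> 'Dune'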
class MySource(Source): options = [ Option( 'clean_title', 'bool', True, _('Clean title'), _('Enable this option clean title metadata and make it "Title Case".' )) ] # Plugin Options has_html_comments = True supports_gzip_transfer_encoding = False # My Options idkey = None maxresults = 5 sleep_time = 0.5 worker_class = None abstract_title = None def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30): md = self.worker_class(self.browser, timeout) d = {} idval = identifiers.get(self.idkey, None) isbn = identifiers.get('isbn', None) if idval: d['id'] = idval if isbn: d['isbn'] = isbn if title: d['title'] = title if authors: d['authors'] = authors md.query(d, maxresults=self.maxresults) while not abort.is_set(): md.join(0.2) if abort.is_set(): break if not md.is_alive(): break time.sleep(self.sleep_time) if not abort.is_set(): for i in range(0, len(md.ans)): mi = self.data2mi(md.ans[i]) mi.source_relevance = i # Less means more relevant. mi.isbn = check_isbn(mi.isbn) result_queue.put(mi) return None def identify_results_keygen(self, title=None, authors=None, identifiers={}): """ Returns a key to sort search results. Lesser value means more relevance.""" query = dict([('title', title), ('authors', authors)] + identifiers.items()) def mi_distance(mi): mifields = dict([('title', mi.title), ('authors', mi.authors)] + mi.identifiers.items()) return metadata_distance(query, mifields, idkey=self.idkey) return mi_distance def data2mi(self, item): """Converts a single metadata answer in the form of a dict to a MetadataInformation object""" mi = Metadata(_('Unknown')) # Regular metadata mi.title = item.get('title', None) mi.authors = item.get('authors', []) mi.publisher = item.get('publisher', None) if 'id' in item.keys(): mi.set_identifier(self.idkey, item['id']) if 'doi' in item.keys(): mi.set_identifier('doi', item['doi']) if 'isbn' in item.keys(): mi.set_identifier('isbn', item['isbn']) if 'updated' in item.keys(): mi.pubdate = parse_date(item['updated'], assume_utc=True) if 'series' in item.keys(): mi.series = item['series'] mi.series_index = self.format_series_index( item.get('series_index'), None) if 'year' in item.keys(): mi.pubdate = parse_date(item['year'], assume_utc=True) if 'abstract' in item.keys(): mi.comments = self.format_abstract(item['abstract']) if 'language' in item.keys(): mi.language = item['language'] if 'journal' in item.keys(): mi.series = item['journal'] mi.series_index = self.format_series_index(item.get('volume'), item.get('number')) if 'subject' in item.keys(): tags = set([]) for s in item['subject']: tags.update(msc_tags(s)) tags.update(arxiv_tags(s)) mi.tags = list(sorted(tags)) return mi def format_abstract(self, abstract): return '<h3>%s</h3>\n %s' % (self.abstract_title, abstract) def format_paragraph(self, par): par = escape(par) par = re.sub(r"{\\it(.*?)}", "<i>\g<1></i>", par) par = re.sub("\s+", ' ', par) return '<p>%s</p>' % par def surname(self, au): return author_to_author_sort(au).split(',')[0] def format_series_index(self, volume, number): """Formats a series index of the form 4.03 indicating number 3 in volume 4.""" v = 0.0 n = 0.0 if volume: try: v = float(volume) except ValueError: v = 0.0 if number: try: n = float(number) except ValueError: n = 0.0 if volume and number: return v + n / 100. elif volume: return v elif number: return n else: return 0.
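# A worked example (standalone, with the ValueError guards of the original omitted) of how
# MySource.format_series_index packs a journal volume and issue into one series_index:
# the volume is the integer part and the number lands in the hundredths.
def pack_series_index(volume, number):
    v = float(volume) if volume else 0.0
    n = float(number) if number else 0.0
    if volume and number:
        return v + n / 100.
    return v if volume else n

# pack_series_index('4', '3')   -> 4.03   (number 3 in volume 4)
# pack_series_index('4', None)  -> 4.0
# pack_series_index(None, '3')  -> 3.0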
class Antikvarium_hu(Source): name = 'Antikvarium_hu' description = _('Downloads metadata and cover from antikvarium.hu') author = 'Hoffer Csaba & Kloon & otapi' version = (2, 0, 3) minimum_calibre_version = (0, 8, 0) capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset([ 'title', 'authors', 'identifier:isbn', 'identifier:antik_hu', 'tags', 'comments', 'publisher', 'pubdate', 'series', 'language', 'languages' ]) has_html_comments = False supports_gzip_transfer_encoding = False KEY_MAX_DOWNLOADS = 'maxDownloads' options = [ Option( KEY_MAX_DOWNLOADS, 'number', 3, _('Maximum number of books to get'), _('The maximum number of books to process from the Antikvarium search result' )), ] BASE_URL = 'https://www.antikvarium.hu' BOOK_URL = BASE_URL + '/konyv/' def create_query(self, log, title=None, authors=None, identifiers={}): if title is not None: search_title = urllib.quote(title.encode('utf-8')) else: search_title = '' log.info(' Title: %s' % search_title) if authors is not None: search_author = urllib.quote(authors[0].encode('utf-8')) else: search_author = '' log.info(' Author: %s' % search_author) search_page = "https://www.antikvarium.hu/index.php?type=search&kc=%s&sz=%s&he=0&jk=0&reszletes=1&rend=kiadasevecsokk&oldaldb=60&kapelol=0&nezet=li&elist=egyebadat&interfaceid=102&oldalcount=1" % ( search_title, search_author) return search_page def get_cached_cover_url(self, identifiers): url = None antik_id = identifiers.get('antik_hu', None) if antik_id is None: isbn = identifiers.get('isbn', None) if isbn is not None: antik_id = self.cached_isbn_to_identifier(isbn) if antik_id is not None: url = self.cached_identifier_to_cover_url(antik_id) return url def cached_identifier_to_cover_url(self, id_): with self.cache_lock: url = self._get_cached_identifier_to_cover_url(id_) if not url: # Try for a "small" image in the cache url = self._get_cached_identifier_to_cover_url('small/' + id_) return url def _get_cached_identifier_to_cover_url(self, id_): # This must only be called once we have the cache lock url = self._identifier_to_cover_url_cache.get(id_, None) if not url: # We could not get a url for this particular B&N id # However we might have one for a different isbn for this book # Barnes & Noble are not very consistent with their covers and # it could be that the particular ISBN we chose does not have # a large image but another ISBN we retrieved does. key_prefix = id_.rpartition('/')[0] for key in self._identifier_to_cover_url_cache.keys(): if key.startswith('key_prefix'): return self._identifier_to_cover_url_cache[key] return url def identify(self, log, result_queue, abort, title, authors, identifiers={}, timeout=30): ''' Note this method will retry without identifiers automatically if no match is found with identifiers. 
''' matches = [] antik_id = identifiers.get('antik_hu', None) isbn = check_isbn(identifiers.get('isbn', None)) br = browser() log.info(u'\nTitle:%s\nAuthors:%s\n' % (title, authors)) if antik_id: matches.append('%s%s' % (Antikvarium_hu.BOOK_URL, antik_id)) else: if isbn: matches.append( 'https://www.antikvarium.hu/index.php?type=search&isbn=%s' % (isbn)) else: query = self.create_query(log, title=title, authors=authors, identifiers=identifiers) if query is None: log.error('Insufficient metadata to construct query') return try: log.info('Querying: %s' % query) response = br.open(query) except Exception as e: if isbn and callable(getattr(e, 'getcode', None)) and e.getcode() == 404: # We did a lookup by ISBN but did not find a match # We will fallback to doing a lookup by title author log.info('Failed to find match for ISBN: %s' % isbn) else: err = 'Failed to make identify query: %r' % query log.exception(err) return as_unicode(e) try: raw = response.read().strip() raw = raw.decode('utf-8', errors='replace') if not raw: log.error('Failed to get raw result for query: %r' % query) return root = fromstring(clean_ascii_chars(raw)) except: msg = 'Failed to parse Antikvarium.hu page for query: %r' % query log.exception(msg) return msg self._parse_search_results(log, title, authors, root, matches, timeout) if abort.is_set(): return if not matches: if identifiers and title and authors: log.info( 'No matches found with identifiers, retrying using only' ' title and authors') return self.identify(log, result_queue, abort, title=title, authors=authors, timeout=timeout) log.error('No matches found with query: %r' % query) return from calibre_plugins.antikvarium_hu.worker import Worker workers = [ Worker(url, result_queue, br, log, i, self) for i, url in enumerate(matches) ] for w in workers: w.start() # Don't send all requests at the same time time.sleep(0.1) while not abort.is_set(): a_worker_is_alive = False for w in workers: w.join(0.2) if abort.is_set(): break if w.is_alive(): a_worker_is_alive = True if not a_worker_is_alive: break return None def _parse_search_results(self, log, title, authors, root, matches, timeout): results = root.xpath('//*[@class="book-data-holder-list"]') max_results = self.prefs[Antikvarium_hu.KEY_MAX_DOWNLOADS] for result in results: urls = result.xpath('//*[@id="searchResultKonyvCim-listas"]/@href') book_url = 'https://www.antikvarium.hu/' + urls[0] log.info('Book URL: %r' % book_url) titlenode = result.xpath( '//*[@id="searchResultKonyvCim-listas"]/span')[0] n_title = '%s' % titlenode.text_content() log.info('Book title: %s' % n_title) authorenode = result.xpath( '//*[@id="searchResultKonyvSzerzo-listas"]')[0] etree.strip_tags(authorenode, 'snap') n_author = '%s' % authorenode.text_content() log.info('Book author: %s' % n_author) if title: if title.lower() not in n_title.lower() and self.strip_accents( title) not in self.strip_accents(n_title): continue if authors: author1 = authors[0] authorsplit = author1.split(" ") author2 = author1 if len(authorsplit) > 1: author2 = '%s %s' % (authorsplit[1], authorsplit[0]) log.info('author1: %s' % author1) log.info('n_author: %s' % n_author) log.info('author2: %s' % author2) if author1.lower() not in n_author.lower() \ and self.strip_accents(author1) not in self.strip_accents(n_author) \ and author2.lower() not in n_author.lower() \ and self.strip_accents(author2) not in self.strip_accents(n_author): continue matches.append(book_url) if len(matches) >= max_results: return def strip_accents(self, s): symbols = (u"öÖüÜóÓőŐúÚéÉáÁűŰíÍ", 
u"oOuUoOoOuUeEaAuUiI") tr = dict([(ord(a), ord(b)) for (a, b) in zip(*symbols)]) return s.translate(tr).lower() def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30): cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: log.info('No cached cover found, running identify') rq = Queue() self.identify(log, rq, abort, title=title, authors=authors, identifiers=identifiers) if abort.is_set(): return results = [] while True: try: results.append(rq.get_nowait()) except Empty: break results.sort(key=self.identify_results_keygen( title=title, authors=authors, identifiers=identifiers)) for mi in results: cached_url = self.get_cached_cover_url(mi.identifiers) if cached_url is not None: break if cached_url is None: log.info('No cover found') return if abort.is_set(): return br = self.browser log.info('Downloading cover from:', cached_url) try: cdata = br.open_novisit(cached_url, timeout=timeout).read() result_queue.put((self, cdata)) except: log.exception('Failed to download cover from:', cached_url)
class Ozon(Source): name = 'OZON.ru' description = _('Downloads metadata and covers from OZON.ru (updated)') capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset([ 'title', 'authors', 'identifier:isbn', 'identifier:ozon', 'publisher', 'pubdate', 'comments', 'series', 'rating', 'languages' ]) # Test purpose only, test function does not like when sometimes some filed are empty # touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon', # 'publisher', 'pubdate', 'comments']) supports_gzip_transfer_encoding = True has_html_comments = True ozon_url = 'http://www.ozon.ru' # match any ISBN10/13. From "Regular Expressions Cookbook" isbnPattern = r'(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|' \ '[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?' \ '(?:[0-9]+[- ]?){2}[0-9X]' isbnRegex = re.compile(isbnPattern) optkey_strictmatch = 'strict_result_match' options = (Option( optkey_strictmatch, 'bool', False, _('Filter out less relevant hits from the search results'), _('Improve search result by removing less relevant hits. It can be useful to refine the search when there are many matches' )), ) def get_book_url(self, identifiers): # {{{ import urllib2 ozon_id = identifiers.get('ozon', None) res = None if ozon_id: # no affiliateId is used in search/detail url = '{}/context/detail/id/{}'.format(self.ozon_url, urllib2.quote(ozon_id), _get_affiliateId()) res = ('ozon', ozon_id, url) return res # }}} def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ from urllib import quote_plus # div_book -> search only books, ebooks and audio books search_url = self.ozon_url + '/?context=search&group=div_book&text=' # for ozon.ru search we have to format ISBN with '-' isbn = _format_isbn(log, identifiers.get('isbn', None)) if isbn and '-' not in isbn: log.error( "%s requires formatted ISBN for search. %s cannot be formated - removed. 
(only Russian ISBN format is supported now)" % (self.name, isbn)) isbn = None ozonid = identifiers.get('ozon', None) qItems = set([ozonid, isbn]) # Added Russian variant of 'Unknown' unk = [_('Unknown').upper(), 'Неизв.'.upper(), icu_upper('Неизв.')] if title and title not in unk: qItems.add(title) if authors: for auth in authors: if icu_upper(auth) not in unk: qItems.add(auth) qItems.discard(None) qItems.discard('') searchText = u' '.join(qItems).strip() if isinstance(searchText, unicode): searchText = searchText.encode('utf-8') if not searchText: return None search_url += quote_plus(searchText) log.debug(u'search url: %r' % search_url) return search_url # }}} def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=90): # {{{ from calibre.ebooks.chardet import xml_to_unicode from HTMLParser import HTMLParser from lxml import etree, html if not self.is_configured(): return query = self.create_query(log, title=title, authors=authors, identifiers=identifiers) if not query: err = u'Insufficient metadata to construct query' log.error(err) return err try: raw = self.browser.open_novisit(query).read() except Exception as e: log.exception(u'Failed to make identify query: %r' % query) return as_unicode(e) try: doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0]) entries_block = doc.xpath(u'//div[@class="bSearchResult"]') if entries_block: entries = doc.xpath( u'//div[contains(@itemprop, "itemListElement")]') # for entry in entries: # log.debug('entries %s' % entree.tostring(entry)) metadata = self.get_metadata(log, entries, title, authors, identifiers) self.get_all_details(log, metadata, abort, result_queue, identifiers, timeout) else: # Redirect page: trying to extract ozon_id from javascript data h = HTMLParser() entry_string = (h.unescape( unicode(etree.tostring(doc, pretty_print=True)))) id_title_pat = re.compile( u'products":\[{"id":(\d{7}),"name":"([а-яА-Я :\-0-9]+)') # result containing ozon_id and entry_title entry_info = re.search(id_title_pat, entry_string) ozon_id = entry_info.group(1) if entry_info else None entry_title = entry_info.group(2) if entry_info else None if ozon_id: metadata = self.to_metadata_for_single_entry( log, ozon_id, entry_title, authors) identifiers['ozon'] = ozon_id self.get_all_details(log, [metadata], abort, result_queue, identifiers, timeout, cachedPagesDict={}) else: log.error('No SearchResults in Ozon.ru response found') except Exception as e: log.exception('Failed to parse identify results') return as_unicode(e) # }}} def to_metadata_for_single_entry(self, log, ozon_id, title, authors): # {{{ # parsing javascript data from the redirect page mi = Metadata(title, authors) mi.identifiers = {'ozon': ozon_id} return mi # }}} def get_metadata(self, log, entries, title, authors, identifiers): # {{{ # some book titles have extra characters like this reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]') title = unicode(title).upper() if title else '' if reRemoveFromTitle: title = reRemoveFromTitle.sub('', title) authors = map(_normalizeAuthorNameWithInitials, map(unicode.upper, map(unicode, authors))) if authors else None ozon_id = identifiers.get('ozon', None) # log.debug(u'ozonid: ', ozon_id) unk = unicode(_('Unknown')).upper() if title == unk: title = None if authors == [unk] or authors == []: authors = None def in_authors(authors, miauthors): for author in authors: for miauthor in miauthors: # log.debug(u'=> %s <> %s'%(author, miauthor)) if author in miauthor: return True return None def calc_source_relevance(mi): # {{{ 
relevance = 0 if title: mititle = unicode(mi.title).upper() if mi.title else '' if reRemoveFromTitle: mititle = reRemoveFromTitle.sub('', mititle) if title in mititle: relevance += 3 elif mititle: # log.debug(u'!!%s!'%mititle) relevance -= 3 else: relevance += 1 if authors: miauthors = map(unicode.upper, map( unicode, mi.authors)) if mi.authors else [] # log.debug('Authors %s vs miauthors %s'%(','.join(authors), ','.join(miauthors))) if (in_authors(authors, miauthors)): relevance += 3 elif u''.join(miauthors): # log.debug(u'!%s!'%u'|'.join(miauthors)) relevance -= 3 else: relevance += 1 if ozon_id: mozon_id = mi.identifiers['ozon'] if ozon_id == mozon_id: relevance += 100 if relevance < 0: relevance = 0 return relevance # }}} strict_match = self.prefs[self.optkey_strictmatch] metadata = [] for entry in entries: mi = self.to_metadata(log, entry) relevance = calc_source_relevance(mi) # TODO findout which is really used mi.source_relevance = relevance mi.relevance_in_source = relevance if not strict_match or relevance > 0: # getting rid of a random book that shows up in results if not (mi.title == 'Unknown'): metadata.append(mi) # log.debug(u'added metadata %s %s.'%(mi.title, mi.authors)) else: log.debug( u'skipped metadata title: %s, authors: %s. (does not match the query - relevance score: %s)' % (mi.title, u' '.join(mi.authors), relevance)) return metadata # }}} def get_all_details(self, log, metadata, abort, result_queue, identifiers, timeout, cachedPagesDict={}): # {{{ req_isbn = identifiers.get('isbn', None) for mi in metadata: if abort.is_set(): break try: ozon_id = mi.identifiers['ozon'] try: self.get_book_details( log, mi, timeout, cachedPagesDict[ozon_id] if cachedPagesDict and ozon_id in cachedPagesDict else None) except: log.exception(u'Failed to get details for metadata: %s' % mi.title) all_isbns = getattr(mi, 'all_isbns', []) if req_isbn and all_isbns and check_isbn( req_isbn) not in all_isbns: log.debug(u'skipped, no requested ISBN %s found' % req_isbn) continue for isbn in all_isbns: self.cache_isbn_to_identifier(isbn, ozon_id) if mi.ozon_cover_url: self.cache_identifier_to_cover_url(ozon_id, mi.ozon_cover_url) self.clean_downloaded_metadata(mi) result_queue.put(mi) except: log.exception(u'Failed to get details for metadata: %s' % mi.title) # }}} def to_metadata(self, log, entry): # {{{ title = unicode( entry.xpath( u'normalize-space(.//span[@itemprop="name"][1]/text())')) # log.debug(u'Title: -----> %s' % title) author = unicode( entry.xpath(u'normalize-space(.//a[contains(@href, "person")])')) # log.debug(u'Author: -----> %s' % author) norm_authors = map(_normalizeAuthorNameWithInitials, map(unicode.strip, unicode(author).split(u','))) mi = Metadata(title, norm_authors) ozon_id = entry.get('data-href').split('/')[-2] if ozon_id: mi.identifiers = {'ozon': ozon_id} # log.debug(u'ozon_id: -----> %s' % ozon_id) mi.ozon_cover_url = None cover = entry.xpath(u'normalize-space(.//img[1]/@src)') log.debug(u'cover: -----> %s' % cover) if cover: mi.ozon_cover_url = _translateToBigCoverUrl(cover) # log.debug(u'mi.ozon_cover_url: -----> %s' % mi.ozon_cover_url) pub_year = None pub_year_block = entry.xpath( u'.//div[@class="bOneTileProperty"]/text()') year_pattern = re.compile('\d{4}') if pub_year_block: pub_year = re.search(year_pattern, pub_year_block[0]) if pub_year: mi.pubdate = toPubdate(log, pub_year.group()) # log.debug('pubdate %s' % mi.pubdate) mi.rating = self.get_rating(log, entry) # if not mi.rating: # log.debug('No rating found. 
ozon_id:%s'%ozon_id) return mi # }}} def get_rating(self, log, entry): # {{{ # log.debug(entry) ozon_rating = None try: xp_rating_template = u'boolean(.//div[contains(@class, "bStars") and contains(@class, "%s")])' rating = None if entry.xpath(xp_rating_template % 'm5'): rating = 5. elif entry.xpath(xp_rating_template % 'm4'): rating = 4. elif entry.xpath(xp_rating_template % 'm3'): rating = 3. elif entry.xpath(xp_rating_template % 'm2'): rating = 2. elif entry.xpath(xp_rating_template % 'm1'): rating = 1. if rating: # 'rating', A floating point number between 0 and 10 # OZON raion N of 5, calibre of 10, but there is a bug? in identify ozon_rating = float(rating) except: pass return ozon_rating # }}} def get_cached_cover_url(self, identifiers): # {{{ url = None ozon_id = identifiers.get('ozon', None) if ozon_id is None: isbn = identifiers.get('isbn', None) if isbn is not None: ozon_id = self.cached_isbn_to_identifier(isbn) if ozon_id is not None: url = self.cached_identifier_to_cover_url(ozon_id) return url # }}} def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): # {{{ cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: log.debug('No cached cover found, running identify') rq = Queue() self.identify(log, rq, abort, title=title, authors=authors, identifiers=identifiers) if abort.is_set(): return results = [] while True: try: results.append(rq.get_nowait()) except Empty: break results.sort(key=self.identify_results_keygen( title=title, authors=authors, identifiers=identifiers)) for mi in results: cached_url = self.get_cached_cover_url(mi.identifiers) if cached_url is not None: break if cached_url is None: log.info('No cover found') return if abort.is_set(): return log.debug('Downloading cover from:', cached_url) try: cdata = self.browser.open_novisit(cached_url, timeout=timeout).read() if cdata: result_queue.put((self, cdata)) except Exception as e: log.exception(u'Failed to download cover from: %s' % cached_url) return as_unicode(e) # }}} def get_book_details(self, log, metadata, timeout, cachedPage): # {{{ from lxml import etree, html from calibre.ebooks.chardet import xml_to_unicode if not cachedPage: url = self.get_book_url(metadata.get_identifiers())[2] # log.debug(u'book_details_url', url) raw = self.browser.open_novisit(url, timeout=timeout).read() fulldoc = html.fromstring(xml_to_unicode(raw, verbose=True)[0]) else: fulldoc = cachedPage log.debug(u'book_details -> using cached page') fullString = etree.tostring(fulldoc) doc = fulldoc.xpath(u'//div[@class="bDetailPage"][1]')[0] # series Серия/Серии series_elem = doc.xpath(u'//div[contains(text(), "Сери")]') if series_elem: series_text_elem = series_elem[0].getnext() metadata.series = series_text_elem.xpath(u'.//a/text()')[0] log.debug(u'**Seria: ', metadata.series) isbn = None isbn_elem = doc.xpath(u'//div[contains(text(), "ISBN")]') if isbn_elem: isbn = isbn_elem[0].getnext().xpath(u'normalize-space(./text())') metadata.identifiers['isbn'] = isbn # get authors/editors if no authors are available authors_joined = ','.join(metadata.authors) if authors_joined == '' or authors_joined == "Unknown": authors_from_detail = [] editor_elem = doc.xpath(u'//div[contains(text(), "Редактор")]') if editor_elem: editor = editor_elem[0].getnext().xpath(u'.//a/text()')[0] authors_from_detail.append(editor + u' (ред.)') authors_elem = doc.xpath(u'//div[contains(text(), "Автор")]') if authors_elem: authors = authors_elem[0].getnext().xpath( u'.//a/text()') 
# list authors_from_detail.extend(authors) if len(authors_from_detail) > 0: metadata.authors = authors_from_detail cover = doc.xpath('.//img[contains(@class, "fullImage")]/@src')[0] metadata.ozon_cover_url = _translateToBigCoverUrl(cover) publishers = None publishers_elem = doc.xpath(u'//div[contains(text(), "Издатель")]') if publishers_elem: publishers_elem = publishers_elem[0].getnext() publishers = publishers_elem.xpath(u'.//a/text()')[0] if publishers: metadata.publisher = publishers displ_lang = None langs = None langs_elem = doc.xpath(u'//div[contains(text(), "зык")]') if langs_elem: langs_elem = langs_elem[0].getnext() langs = langs_elem.xpath(u'text()')[0].strip() if langs: lng_splt = langs.split(u',') if lng_splt: displ_lang = lng_splt[0].strip() # log.debug(u'displ_lang1: ', displ_lang) metadata.language = _translageLanguageToCode(displ_lang) # log.debug(u'Language: ', metadata.language) # can be set before from xml search response if not metadata.pubdate: pubdate_elem = doc.xpath(u'//div[contains(text(), "Год выпуска")]') if pubdate_elem: pubYear = pubdate_elem[0].getnext().xpath(u'text()')[0].strip() if pubYear: matcher = re.search(r'\d{4}', pubYear) if matcher: metadata.pubdate = toPubdate(log, matcher.group(0)) # log.debug(u'Pubdate: ', metadata.pubdate) # comments, from Javascript data beginning = fullString.find(u'FirstBlock') end = fullString.find(u'}', beginning) comments = unicode(fullString[beginning + 75:end - 1]).decode("unicode-escape") metadata.comments = replace_entities(comments, 'utf-8')
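# A bare-bones rendering (not the plugin's actual code path) of the relevance scoring used
# in Ozon.get_metadata when ranking search hits: +3 for a title containment match, +3 for
# an author match (each -3 when the hit has data that does not match), +100 when the ozon
# identifier itself matches, with negative totals clamped to zero.
def source_relevance(title_matches, author_matches, ozon_id_matches):
    score = (3 if title_matches else -3) + (3 if author_matches else -3)
    if ozon_id_matches:
        score += 100
    return max(score, 0)

# source_relevance(True, True, False)    -> 6
# source_relevance(True, False, False)   -> 0
# source_relevance(False, False, True)   -> 94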
class Ehentai(Source): name = 'E-hentai Galleries' author = 'Wu yuan, cssxsh' version = (1, 1, 3) minimum_calibre_version = (2, 80, 0) description = _('Download metadata and cover from e-hentai.org.' 'Useful only for doujinshi.') capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset([ 'title', 'authors', 'tags', 'rating', 'publisher', 'identifier:ehentai' ]) supports_gzip_transfer_encoding = True cached_cover_url_is_reliable = True EHentai_url = 'https://e-hentai.org/g/%s/%s/' ExHentai_url = 'https://exhentai.org/g/%s/%s/' options = ( Option( 'Use_Exhentai', 'bool', False, _('Use Exhentai'), _('If Use Exhentai is True, the plugin will search metadata on exhentai.' )), Option('ipb_member_id', 'string', None, _('ipb_member_id'), _('If Use Exhentai is True, please input your cookies.')), Option('ipb_pass_hash', 'string', None, _('ipb_pass_hash'), _('If Use Exhentai is True, please input your cookies.')), Option('igneous', 'string', None, _('igneous'), _('If Use Exhentai is True, please input your cookies.')), Option( 'Use_Proxy', 'bool', False, _('Use Proxy'), _('If Use Proxy is True, the plugin will search metadata by proxy.' )), Option( 'link', 'string', None, _('link'), # username:[email protected]:8888 _('If Use Proxy is True, please input your proxy. example: username:[email protected]:8888' )), ) config_help_message = ('<p>' + _( 'To Download Metadata from exhentai.org you must sign up' ' a free account and get the cookies of .exhentai.org.' ' If you don\'t have an account, you can <a href="%s">sign up</a>.') ) % 'https://forums.e-hentai.org/index.php' def __init__(self, *args, **kwargs): # {{{ Source.__init__(self, *args, **kwargs) self.config_exhentai() self.config_proxy() # }}} def config_exhentai(self): # {{{ ExHentai_Status = self.prefs['Use_Exhentai'] ExHentai_Cookies = [ { 'name': 'ipb_member_id', 'value': self.prefs['ipb_member_id'], 'domain': '.exhentai.org', 'path': '/' }, { 'name': 'ipb_pass_hash', 'value': self.prefs['ipb_pass_hash'], 'domain': '.exhentai.org', 'path': '/' }, { 'name': 'igneous', 'value': self.prefs['igneous'], 'domain': '.exhentai.org', 'path': '/' }, ] if ExHentai_Status is True: for cookie in ExHentai_Cookies: if cookie['value'] is None: ExHentai_Status = False break self.ExHentai_Status = ExHentai_Status self.ExHentai_Cookies = ExHentai_Cookies return # }}} def config_proxy(self): # {{{ Proxy_Status = self.prefs['Use_Proxy'] Proxy = {'https': self.prefs['link'], 'http': self.prefs['link']} self.Proxy_Status = Proxy_Status self.Proxy = Proxy # }}} def create_query(self, log, title=None, authors=None, identifiers={}, is_exhentai=False): # {{{ EHentai_SEARCH_URL = 'https://e-hentai.org/?' ExHentai_SEARCH_URL = 'https://exhentai.org/?' 
q = '' if title or authors: def build_term(type, parts): return ' '.join(x for x in parts) title_token = list(self.get_title_tokens(title)) if title_token: q = q + build_term('title', title_token) author_token = list( self.get_author_tokens(authors, only_first_author=True)) if author_token: q = q + (' ' if q != '' else '') + build_term( 'author', author_token) q = q.strip() if isinstance(q, unicode): q = q.encode('utf-8') if not q: return None q_dict = { 'f_doujinshi': 1, 'f_manga': 1, 'f_artistcg': 1, 'f_gamecg': 1, 'f_western': 1, 'f_non-h': 1, 'f_imageset': 1, 'f_cosplay': 1, 'f_asianporn': 1, 'f_misc': 1, 'f_search': q, 'f_apply': 'Apply+Filter', 'advsearch': 1, 'f_sname': 'on', 'f_sh': 'on', 'f_srdd': 2 } if is_exhentai is False: url = EHentai_SEARCH_URL + urlencode(q_dict) else: url = ExHentai_SEARCH_URL + urlencode(q_dict) return url # }}} def get_gallery_info(self, log, raw): # {{{ pattern = re.compile( r'https:\/\/(?:e-hentai\.org|exhentai\.org)\/g\/(?P<gallery_id>\d+)/(?P<gallery_token>\w+)/' ) results = re.findall(pattern, raw) if not results: log.exception('Failed to get gallery_id and gallery_token!') return None gidlist = [] for r in results: gidlist.append(list(r)) return gidlist # }}} def get_all_details(self, gidlist, log, abort, result_queue, timeout): # {{{ EHentai_API_url = 'https://api.e-hentai.org/api.php' ExHentai_API_url = 'https://exhentai.org/api.php' is_exhentai = self.ExHentai_Status use_proxy = self.Proxy_Status proxy = self.Proxy url = EHentai_API_url br = self.browser if is_exhentai is True: url = ExHentai_API_url if use_proxy is True: def proxy_bypass(hostname): log(hostname + ' by proxy') return True br.set_proxies(proxy, proxy_bypass) data = {"method": "gdata", "gidlist": gidlist, "namespace": 1} data = json.dumps(data) try: _raw = br.open_novisit(url, timeout=timeout) raw = _raw.read() except Exception as e: log.exception('Failed to make api request.', e) return gmetadatas = json.loads(raw)['gmetadata'] for relevance, gmetadata in enumerate(gmetadatas): try: ans = to_metadata(log, gmetadata, self.ExHentai_Status) if isinstance(ans, Metadata): ans.source_relevance = relevance db = ans.identifiers['ehentai'] if ans.has_ehentai_cover: self.cache_identifier_to_cover_url( db, ans.has_ehentai_cover) self.clean_downloaded_metadata(ans) result_queue.put(ans) except: log.exception('Failed to get metadata for identify entry:', gmetadata) if abort.is_set(): break # }}} def get_book_url(self, identifiers): # {{{ db = identifiers.get('ehentai', None) d = {'0': False, '1': True} if db is not None: gid, token, s = re.split('_', db) ExHentai_Status = d[str(s)] if ExHentai_Status: url = self.ExHentai_url % (gid, token) else: url = self.EHentai_url % (gid, token) return ('ehentai', db, url) # }}} def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): # {{{ cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: return if abort.is_set(): return br = self.browser log('Downloading cover from:', cached_url) try: cdata = br.open_novisit(cached_url, timeout=timeout).read() if cdata: result_queue.put((self, cdata)) except: log.exception('Failed to download cover from:', cached_url) # }}} def get_cached_cover_url(self, identifiers): # {{{ url = None db = identifiers.get('ehentai', None) if db is None: pass if db is not None: url = self.cached_identifier_to_cover_url(db) return url # }}} def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30): # {{{ 
is_exhentai = self.ExHentai_Status use_proxy = self.Proxy_Status proxy = self.Proxy query = self.create_query(log, title=title, authors=authors, identifiers=identifiers, is_exhentai=is_exhentai) if not query: log.error('Insufficient metadata to construct query') return br = self.browser if use_proxy is True: def proxy_bypass(hostname): log(hostname + ' by proxy') return True br.set_proxies(proxy, proxy_bypass) if is_exhentai is True: for cookie in self.ExHentai_Cookies: br.set_cookie(name=cookie['name'], value=cookie['value'], domain=cookie['domain'], path=cookie['path']) try: _raw = br.open_novisit(query, timeout=timeout) raw = _raw.read() except Exception as e: log.exception('Failed to make identify query: %r' % query) return as_unicode(e) if not raw and identifiers and title and authors and not abort.is_set( ): return self.identify(log, result_queue, abort, title=title, authors=authors, timeout=timeout) if is_exhentai is True: try: 'https://exhentai.org/' in raw except Exception as e: log.error('The cookies for ExHentai is invalid.') log.error('Exhentai cookies:') log.error(self.ExHentai_Cookies) return gidlist = self.get_gallery_info(log, raw) if not gidlist: log.error('No result found.\n', 'query: %s' % query) return self.get_all_details(gidlist=gidlist, log=log, abort=abort, result_queue=result_queue, timeout=timeout)
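# A minimal standalone sketch (placeholder values, not plugin code) of the "gdata" request
# that Ehentai.get_all_details prepares for the e-hentai API at
# https://api.e-hentai.org/api.php: the gid/token pairs go into a JSON payload and the
# response's 'gmetadata' list carries one entry per gallery. Posting the payload as the
# request body with urllib is one way to issue it outside the plugin's browser object.
import json
from urllib.request import Request, urlopen

def fetch_gallery_metadata(gidlist, timeout=30):
    payload = json.dumps({"method": "gdata", "gidlist": gidlist, "namespace": 1})
    req = Request('https://api.e-hentai.org/api.php',
                  data=payload.encode('utf-8'),
                  headers={'Content-Type': 'application/json'})
    raw = urlopen(req, timeout=timeout).read()
    return json.loads(raw)['gmetadata']

# fetch_gallery_metadata([['1234567', 'abcdef1234']])  # [gallery_id, gallery_token] pairs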
class OverDrive(Source): name = 'Overdrive' version = (1, 0, 1) minimum_calibre_version = (2, 80, 0) description = _( 'Downloads metadata and covers from Overdrive\'s Content Reserve') capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset([ 'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher', 'identifier:isbn', 'series', 'series_index', 'languages', 'identifier:overdrive' ]) has_html_comments = True supports_gzip_transfer_encoding = False cached_cover_url_is_reliable = True options = (Option( 'get_full_metadata', 'bool', True, _('Download all metadata (slow)'), _('Enable this option to gather all metadata available from Overdrive.' )), ) config_help_message = '<p>' + _( 'Additional metadata can be taken from Overdrive\'s book detail' ' page. This includes a limited set of tags used by libraries, comments, language,' ' and the e-book ISBN. Collecting this data is disabled by default due to the extra' ' time required. Check the download all metadata option below to' ' enable downloading this data.') def identify( self, log, result_queue, abort, title=None, authors=None, # {{{ identifiers={}, timeout=30): ovrdrv_id = identifiers.get('overdrive', None) isbn = identifiers.get('isbn', None) br = self.browser ovrdrv_data = self.to_ovrdrv_data(br, log, title, authors, ovrdrv_id) if ovrdrv_data: title = ovrdrv_data[8] authors = ovrdrv_data[6] mi = Metadata(title, authors) self.parse_search_results(ovrdrv_data, mi) if ovrdrv_id is None: ovrdrv_id = ovrdrv_data[7] if self.prefs['get_full_metadata']: self.get_book_detail(br, ovrdrv_data[1], mi, ovrdrv_id, log) if isbn is not None: self.cache_isbn_to_identifier(isbn, ovrdrv_id) result_queue.put(mi) return None # }}} def download_cover( self, log, result_queue, abort, # {{{ title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): import mechanize cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: log.info('No cached cover found, running identify') rq = Queue() self.identify(log, rq, abort, title=title, authors=authors, identifiers=identifiers) if abort.is_set(): return results = [] while True: try: results.append(rq.get_nowait()) except Empty: break results.sort(key=self.identify_results_keygen( title=title, authors=authors, identifiers=identifiers)) for mi in results: cached_url = self.get_cached_cover_url(mi.identifiers) if cached_url is not None: break if cached_url is None: log.info('No cover found') return if abort.is_set(): return ovrdrv_id = identifiers.get('overdrive', None) br = self.browser req = mechanize.Request(cached_url) if ovrdrv_id is not None: referer = self.get_base_referer( ) + 'ContentDetails-Cover.htm?ID=' + ovrdrv_id req.add_header('referer', referer) log('Downloading cover from:', cached_url) try: cdata = br.open_novisit(req, timeout=timeout).read() result_queue.put((self, cdata)) except: log.exception('Failed to download cover from:', cached_url) # }}} def get_cached_cover_url(self, identifiers): # {{{ url = None ovrdrv_id = identifiers.get('overdrive', None) if ovrdrv_id is None: isbn = identifiers.get('isbn', None) if isbn is not None: ovrdrv_id = self.cached_isbn_to_identifier(isbn) if ovrdrv_id is not None: url = self.cached_identifier_to_cover_url(ovrdrv_id) return url # }}} def get_base_referer( self): # to be used for passing referrer headers to cover download choices = [ 'https://overdrive.chipublib.org/82DC601D-7DDE-4212-B43A-09D821935B01/10/375/en/', 'https://emedia.clevnet.org/9D321DAD-EC0D-490D-BFD8-64AE2C96ECA8/10/241/en/', 
'https://singapore.lib.overdrive.com/F11D55BE-A917-4D63-8111-318E88B29740/10/382/en/', 'https://ebooks.nypl.org/20E48048-A377-4520-BC43-F8729A42A424/10/257/en/', 'https://spl.lib.overdrive.com/5875E082-4CB2-4689-9426-8509F354AFEF/10/335/en/' ] return choices[random.randint(0, len(choices) - 1)] def format_results(self, reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid): fix_slashes = re.compile(r'\\/') thumbimage = fix_slashes.sub('/', thumbimage) worldcatlink = fix_slashes.sub('/', worldcatlink) cover_url = re.sub(r'(?P<img>(Ima?g(eType-)?))200', r'\g<img>100', thumbimage) social_metadata_url = base_url + 'TitleInfo.aspx?ReserveID=' + reserveid + '&FormatID=' + formatid series_num = '' if not series: if subtitle: title = od_title + ': ' + subtitle else: title = od_title else: title = od_title m = re.search("([0-9]+$)", subtitle) if m: series_num = float(m.group(1)) return [ cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title ] def safe_query(self, br, query_url, post=''): ''' The query must be initialized by loading an empty search results page this page attempts to set a cookie that Mechanize doesn't like copy the cookiejar to a separate instance and make a one-off request with the temp cookiejar ''' import mechanize goodcookies = br._ua_handlers['_cookies'].cookiejar clean_cj = mechanize.CookieJar() cookies_to_copy = [] for cookie in goodcookies: copied_cookie = copy.deepcopy(cookie) cookies_to_copy.append(copied_cookie) for copied_cookie in cookies_to_copy: clean_cj.set_cookie(copied_cookie) if post: br.open_novisit(query_url, post) else: br.open_novisit(query_url) br.set_cookiejar(clean_cj) def overdrive_search(self, br, log, q, title, author): import mechanize # re-initialize the cookiejar to so that it's clean clean_cj = mechanize.CookieJar() br.set_cookiejar(clean_cj) q_query = q + 'default.aspx/SearchByKeyword' q_init_search = q + 'SearchResults.aspx' # get first author as string - convert this to a proper cleanup function later author_tokens = list( self.get_author_tokens(author, only_first_author=True)) title_tokens = list( self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True)) xref_q = '' if len(author_tokens) <= 1: initial_q = ' '.join(title_tokens) xref_q = '+'.join(author_tokens) else: initial_q = ' '.join(author_tokens) for token in title_tokens: if len(xref_q) < len(token): xref_q = token log.error('Initial query is %s' % initial_q) log.error('Cross reference query is %s' % xref_q) q_xref = q + 'SearchResults.svc/GetResults?iDisplayLength=50&sSearch=' + xref_q query = '{"szKeyword":"' + initial_q + '"}' # main query, requires specific Content Type header req = mechanize.Request(q_query) req.add_header('Content-Type', 'application/json; charset=utf-8') br.open_novisit(req, query) # initiate the search without messing up the cookiejar self.safe_query(br, q_init_search) # get the search results object results = False iterations = 0 while results is False: iterations += 1 xreq = mechanize.Request(q_xref) xreq.add_header('X-Requested-With', 'XMLHttpRequest') xreq.add_header('Referer', q_init_search) xreq.add_header('Accept', 'application/json, text/javascript, */*') raw = br.open_novisit(xreq).read() for m in re.finditer( type('') (r'"iTotalDisplayRecords":(?P<displayrecords>\d+).*?"iTotalRecords":(?P<totalrecords>\d+)' ), raw): if int(m.group('totalrecords')) == 0: return '' elif int(m.group('displayrecords')) >= 1: results = True elif int(m.group('totalrecords')) 
>= 1 and iterations < 3: if xref_q.find('+') != -1: xref_tokens = xref_q.split('+') xref_q = xref_tokens[0] for token in xref_tokens: if len(xref_q) < len(token): xref_q = token # log.error('rewrote xref_q, new query is '+xref_q) else: xref_q = '' q_xref = q + 'SearchResults.svc/GetResults?iDisplayLength=50&sSearch=' + xref_q return self.sort_ovrdrv_results(raw, log, title, title_tokens, author, author_tokens) def sort_ovrdrv_results(self, raw, log, title=None, title_tokens=None, author=None, author_tokens=None, ovrdrv_id=None): close_matches = [] raw = re.sub(r'.*?\[\[(?P<content>.*?)\]\].*', r'[[\g<content>]]', raw) results = json.loads(raw) # log.error('raw results are:'+type('')(results)) # The search results are either from a keyword search or a multi-format list from a single ID, # sort through the results for closest match/format if results: for reserveid, od_title, subtitle, edition, series, publisher, format, formatid, creators, \ thumbimage, shortdescription, worldcatlink, excerptlink, creatorfile, sorttitle, \ availabletolibrary, availabletoretailer, relevancyrank, unknown1, unknown2, unknown3 in results: # log.error("this record's title is "+od_title+", subtitle is "+subtitle+", author[s] are "+creators+", series is "+series) if ovrdrv_id is not None and int(formatid) in [ 1, 50, 410, 900 ]: # log.error('overdrive id is not None, searching based on format type priority') return self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid) else: if creators: creators = creators.split(', ') # if an exact match in a preferred format occurs if ((author and creators and creators[0] == author[0]) or (not author and not creators)) and \ od_title.lower() == title.lower() and int(formatid) in [1, 50, 410, 900] and thumbimage: return self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid) else: close_title_match = False close_author_match = False for token in title_tokens: if od_title.lower().find(token.lower()) != -1: close_title_match = True else: close_title_match = False break for author in creators: for token in author_tokens: if author.lower().find(token.lower()) != -1: close_author_match = True else: close_author_match = False break if close_author_match: break if close_title_match and close_author_match and int( formatid) in [1, 50, 410, 900] and thumbimage: if subtitle and series: close_matches.insert( 0, self.format_results( reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)) else: close_matches.append( self.format_results( reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)) elif close_title_match and close_author_match and int( formatid) in [1, 50, 410, 900]: close_matches.append( self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)) if close_matches: return close_matches[0] else: return '' else: return '' def overdrive_get_record(self, br, log, q, ovrdrv_id): import mechanize search_url = q + 'SearchResults.aspx?ReserveID={' + ovrdrv_id + '}' results_url = q + 
'SearchResults.svc/GetResults?sEcho=1&iColumns=18&sColumns=ReserveID%2CTitle%2CSubtitle%2CEdition%2CSeries%2CPublisher%2CFormat%2CFormatID%2CCreators%2CThumbImage%2CShortDescription%2CWorldCatLink%2CExcerptLink%2CCreatorFile%2CSortTitle%2CAvailableToLibrary%2CAvailableToRetailer%2CRelevancyRank&iDisplayStart=0&iDisplayLength=10&sSearch=&bEscapeRegex=true&iSortingCols=1&iSortCol_0=17&sSortDir_0=asc' # noqa # re-initialize the cookiejar to so that it's clean clean_cj = mechanize.CookieJar() br.set_cookiejar(clean_cj) # get the base url to set the proper session cookie br.open_novisit(q) # initialize the search self.safe_query(br, search_url) # get the results req = mechanize.Request(results_url) req.add_header('X-Requested-With', 'XMLHttpRequest') req.add_header('Referer', search_url) req.add_header('Accept', 'application/json, text/javascript, */*') raw = br.open_novisit(req) raw = type('')(list(raw)) clean_cj = mechanize.CookieJar() br.set_cookiejar(clean_cj) return self.sort_ovrdrv_results(raw, log, None, None, None, ovrdrv_id) def find_ovrdrv_data(self, br, log, title, author, isbn, ovrdrv_id=None): q = base_url if ovrdrv_id is None: return self.overdrive_search(br, log, q, title, author) else: return self.overdrive_get_record(br, log, q, ovrdrv_id) def to_ovrdrv_data(self, br, log, title=None, author=None, ovrdrv_id=None): ''' Takes either a title/author combo or an Overdrive ID. One of these two must be passed to this function. ''' if ovrdrv_id is not None: with cache_lock: ans = ovrdrv_data_cache.get(ovrdrv_id, None) if ans: return ans elif ans is False: return None else: ovrdrv_data = self.find_ovrdrv_data(br, log, title, author, ovrdrv_id) else: try: ovrdrv_data = self.find_ovrdrv_data(br, log, title, author, ovrdrv_id) except: import traceback traceback.print_exc() ovrdrv_data = None with cache_lock: ovrdrv_data_cache[ ovrdrv_id] = ovrdrv_data if ovrdrv_data else False return ovrdrv_data if ovrdrv_data else False def parse_search_results(self, ovrdrv_data, mi): ''' Parse the formatted search results from the initial Overdrive query and add the values to the metadta. 
The list object has these values: [cover_url[0], social_metadata_url[1], worldcatlink[2], series[3], series_num[4], publisher[5], creators[6], reserveid[7], title[8]] ''' ovrdrv_id = ovrdrv_data[7] mi.set_identifier('overdrive', ovrdrv_id) if len(ovrdrv_data[3]) > 1: mi.series = ovrdrv_data[3] if ovrdrv_data[4]: try: mi.series_index = float(ovrdrv_data[4]) except: pass mi.publisher = ovrdrv_data[5] mi.authors = ovrdrv_data[6] mi.title = ovrdrv_data[8] cover_url = ovrdrv_data[0] if cover_url: self.cache_identifier_to_cover_url(ovrdrv_id, cover_url) def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log): from html5_parser import parse from lxml import html from calibre.ebooks.chardet import xml_to_unicode from calibre.library.comments import sanitize_comments_html try: raw = br.open_novisit(metadata_url).read() except Exception as e: if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: return False raise raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0] try: root = parse(raw, maybe_xhtml=False, sanitize_names=True) except Exception: return False pub_date = root.xpath( "//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()") lang = root.xpath( "//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()") subjects = root.xpath( "//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()") ebook_isbn = root.xpath( "//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()") desc = root.xpath( "//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]" ) if pub_date: from calibre.utils.date import parse_date try: mi.pubdate = parse_date(pub_date[0].strip()) except: pass if lang: lang = lang[0].strip().lower() lang = { 'english': 'eng', 'french': 'fra', 'german': 'deu', 'spanish': 'spa' }.get(lang, None) if lang: mi.language = lang if ebook_isbn: # print("ebook isbn is "+type('')(ebook_isbn[0])) isbn = check_isbn(ebook_isbn[0].strip()) if isbn: self.cache_isbn_to_identifier(isbn, ovrdrv_id) mi.isbn = isbn if subjects: mi.tags = [tag.strip() for tag in subjects[0].split(',')] if desc: desc = desc[0] desc = html.tostring(desc, method='html', encoding='unicode').strip() # remove all attributes from tags desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc) # Remove comments desc = re.sub(r'(?s)<!--.*?-->', '', desc) mi.comments = sanitize_comments_html(desc) return None
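# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the plugin above; sample values invented):
# how the positional ovrdrv_data list documented in parse_search_results()
# maps onto metadata fields, shown standalone with a plain dict instead of
# calibre's Metadata object.
def _ovrdrv_data_to_dict(ovrdrv_data):
    # [cover_url, social_metadata_url, worldcatlink, series, series_num,
    #  publisher, creators, reserveid, title]
    md = {
        'overdrive_id': ovrdrv_data[7],
        'publisher': ovrdrv_data[5],
        'authors': ovrdrv_data[6],
        'title': ovrdrv_data[8],
        'cover_url': ovrdrv_data[0] or None,
    }
    if len(ovrdrv_data[3]) > 1:
        md['series'] = ovrdrv_data[3]
        try:
            md['series_index'] = float(ovrdrv_data[4])
        except (TypeError, ValueError):
            pass
    return md
# Example (invented data):
# _ovrdrv_data_to_dict(['http://img/x100.jpg', 'http://meta', 'http://worldcat',
#                       'Example Series', '2', 'Example Publisher',
#                       ['Jane Doe'], 'ABCD-1234', 'Example Title'])
# -> {'overdrive_id': 'ABCD-1234', ..., 'series': 'Example Series', 'series_index': 2.0}
# ---------------------------------------------------------------------------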
class Amazon(Source): name = 'Amazon.com' description = _('Downloads metadata and covers from Amazon') capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset(['title', 'authors', 'identifier:amazon', 'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate', 'languages', 'series']) has_html_comments = True supports_gzip_transfer_encoding = True AMAZON_DOMAINS = { 'com': _('US'), 'fr': _('France'), 'de': _('Germany'), 'uk': _('UK'), 'it': _('Italy'), 'jp': _('Japan'), 'es': _('Spain'), 'br': _('Brazil'), } options = ( Option('domain', 'choices', 'com', _('Amazon website to use:'), _('Metadata from Amazon will be fetched using this ' 'country\'s Amazon website.'), choices=AMAZON_DOMAINS), ) def __init__(self, *args, **kwargs): Source.__init__(self, *args, **kwargs) self.set_amazon_id_touched_fields() def test_fields(self, mi): ''' Return the first field from self.touched_fields that is null on the mi object ''' for key in self.touched_fields: if key.startswith('identifier:'): key = key.partition(':')[-1] if key == 'amazon': if self.domain != 'com': key += '_' + self.domain if not mi.has_identifier(key): return 'identifier: ' + key elif mi.is_null(key): return key @property def user_agent(self): # Pass in an index to random_user_agent() to test with a particular # user agent return random_user_agent() def save_settings(self, *args, **kwargs): Source.save_settings(self, *args, **kwargs) self.set_amazon_id_touched_fields() def set_amazon_id_touched_fields(self): ident_name = "identifier:amazon" if self.domain != 'com': ident_name += '_' + self.domain tf = [x for x in self.touched_fields if not x.startswith('identifier:amazon')] + [ident_name] self.touched_fields = frozenset(tf) def get_domain_and_asin(self, identifiers): for key, val in identifiers.iteritems(): key = key.lower() if key in ('amazon', 'asin'): return 'com', val if key.startswith('amazon_'): domain = key.split('_')[-1] if domain and domain in self.AMAZON_DOMAINS: return domain, val return None, None def get_book_url(self, identifiers): # {{{ domain, asin = self.get_domain_and_asin(identifiers) if domain and asin: url = None if domain == 'com': url = 'http://amzn.com/'+asin elif domain == 'uk': url = 'http://www.amazon.co.uk/dp/'+asin elif domain == 'br': url = 'http://www.amazon.com.br/dp/'+asin else: url = 'http://www.amazon.%s/dp/%s'%(domain, asin) if url: idtype = 'amazon' if domain == 'com' else 'amazon_'+domain return (idtype, asin, url) def get_book_url_name(self, idtype, idval, url): if idtype == 'amazon': return self.name return 'A' + idtype.replace('_', '.')[1:] # }}} @property def domain(self): x = getattr(self, 'testing_domain', None) if x is not None: return x domain = self.prefs['domain'] if domain not in self.AMAZON_DOMAINS: domain = 'com' return domain def clean_downloaded_metadata(self, mi): docase = ( mi.language == 'eng' or (mi.is_null('language') and self.domain in {'com', 'uk'}) ) if mi.title and docase: mi.title = fixcase(mi.title) mi.authors = fixauthors(mi.authors) if mi.tags and docase: mi.tags = list(map(fixcase, mi.tags)) mi.isbn = check_isbn(mi.isbn) def get_website_domain(self, domain): udomain = domain if domain == 'uk': udomain = 'co.uk' elif domain == 'jp': udomain = 'co.jp' elif domain == 'br': udomain = 'com.br' return udomain def create_query(self, log, title=None, authors=None, identifiers={}, # {{{ domain=None): from urllib import urlencode if domain is None: domain = self.domain idomain, asin = self.get_domain_and_asin(identifiers) if idomain is not None: domain = 
idomain # See the amazon detailed search page to get all options q = {'search-alias': 'aps', 'unfiltered': '1', } if domain == 'com': q['sort'] = 'relevanceexprank' else: q['sort'] = 'relevancerank' isbn = check_isbn(identifiers.get('isbn', None)) if asin is not None: q['field-keywords'] = asin elif isbn is not None: q['field-isbn'] = isbn else: # Only return book results q['search-alias'] = 'digital-text' if domain == 'br' else 'stripbooks' if title: title_tokens = list(self.get_title_tokens(title)) if title_tokens: q['field-title'] = ' '.join(title_tokens) if authors: author_tokens = self.get_author_tokens(authors, only_first_author=True) if author_tokens: q['field-author'] = ' '.join(author_tokens) if not ('field-keywords' in q or 'field-isbn' in q or ('field-title' in q)): # Insufficient metadata to make an identify query return None, None # magic parameter to enable Japanese Shift_JIS encoding. if domain == 'jp': q['__mk_ja_JP'] = u'カタカナ' if domain == 'jp': encode_to = 'Shift_JIS' else: encode_to = 'latin1' encoded_q = dict([(x.encode(encode_to, 'ignore'), y.encode(encode_to, 'ignore')) for x, y in q.iteritems()]) url = 'http://www.amazon.%s/s/?'%self.get_website_domain(domain) + urlencode(encoded_q) return url, domain # }}} def get_cached_cover_url(self, identifiers): # {{{ url = None domain, asin = self.get_domain_and_asin(identifiers) if asin is None: isbn = identifiers.get('isbn', None) if isbn is not None: asin = self.cached_isbn_to_identifier(isbn) if asin is not None: url = self.cached_identifier_to_cover_url(asin) return url # }}} def parse_results_page(self, root, domain): # {{{ from lxml.html import tostring matches = [] def title_ok(title): title = title.lower() bad = ['bulk pack', '[audiobook]', '[audio cd]', '(a book companion)', '( slipcase with door )'] if self.domain == 'com': bad.extend(['(%s edition)' % x for x in ('spanish', 'german')]) for x in bad: if x in title: return False return True for div in root.xpath(r'//div[starts-with(@id, "result_")]'): links = div.xpath(r'descendant::a[@class="title" and @href]') if not links: # New amazon markup links = div.xpath('descendant::h3/a[@href]') for a in links: title = tostring(a, method='text', encoding=unicode) if title_ok(title): url = a.get('href') if url.startswith('/'): url = 'http://www.amazon.%s%s' % (self.get_website_domain(domain), url) matches.append(url) break if not matches: # This can happen for some user agents that Amazon thinks are # mobile/less capable for td in root.xpath( r'//div[@id="Results"]/descendant::td[starts-with(@id, "search:Td:")]'): for a in td.xpath(r'descendant::td[@class="dataColumn"]/descendant::a[@href]/span[@class="srTitle"]/..'): title = tostring(a, method='text', encoding=unicode) if title_ok(title): url = a.get('href') if url.startswith('/'): url = 'http://www.amazon.%s%s' % (self.get_website_domain(domain), url) matches.append(url) break # Keep only the top 5 matches as the matches are sorted by relevance by # Amazon so lower matches are not likely to be very relevant return matches[:5] # }}} def identify(self, log, result_queue, abort, title=None, authors=None, # {{{ identifiers={}, timeout=30): ''' Note this method will retry without identifiers automatically if no match is found with identifiers. 
''' from calibre.utils.cleantext import clean_ascii_chars from calibre.ebooks.chardet import xml_to_unicode from lxml.html import tostring import html5lib testing = getattr(self, 'running_a_test', False) query, domain = self.create_query(log, title=title, authors=authors, identifiers=identifiers) if query is None: log.error('Insufficient metadata to construct query') return br = self.browser if testing: print ('Using user agent for amazon: %s'%self.user_agent) try: raw = br.open_novisit(query, timeout=timeout).read().strip() except Exception as e: if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: log.error('Query malformed: %r'%query) return attr = getattr(e, 'args', [None]) attr = attr if attr else [None] if isinstance(attr[0], socket.timeout): msg = _('Amazon timed out. Try again later.') log.error(msg) else: msg = 'Failed to make identify query: %r'%query log.exception(msg) return as_unicode(msg) raw = clean_ascii_chars(xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]) if testing: import tempfile with tempfile.NamedTemporaryFile(prefix='amazon_results_', suffix='.html', delete=False) as f: f.write(raw.encode('utf-8')) print ('Downloaded html for results page saved in', f.name) matches = [] found = '<title>404 - ' not in raw if found: try: root = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False) except: msg = 'Failed to parse amazon page for query: %r'%query log.exception(msg) return msg errmsg = root.xpath('//*[@id="errorMessage"]') if errmsg: msg = tostring(errmsg, method='text', encoding=unicode).strip() log.error(msg) # The error is almost always a not found error found = False if found: matches = self.parse_results_page(root, domain) if abort.is_set(): return if not matches: if identifiers and title and authors: log('No matches found with identifiers, retrying using only' ' title and authors. Query: %r'%query) return self.identify(log, result_queue, abort, title=title, authors=authors, timeout=timeout) log.error('No matches found with query: %r'%query) return workers = [Worker(url, result_queue, br, log, i, domain, self, testing=testing) for i, url in enumerate(matches)] for w in workers: w.start() # Don't send all requests at the same time time.sleep(0.1) while not abort.is_set(): a_worker_is_alive = False for w in workers: w.join(0.2) if abort.is_set(): break if w.is_alive(): a_worker_is_alive = True if not a_worker_is_alive: break return None # }}} def download_cover(self, log, result_queue, abort, # {{{ title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: log.info('No cached cover found, running identify') rq = Queue() self.identify(log, rq, abort, title=title, authors=authors, identifiers=identifiers) if abort.is_set(): return results = [] while True: try: results.append(rq.get_nowait()) except Empty: break results.sort(key=self.identify_results_keygen( title=title, authors=authors, identifiers=identifiers)) for mi in results: cached_url = self.get_cached_cover_url(mi.identifiers) if cached_url is not None: break if cached_url is None: log.info('No cover found') return if abort.is_set(): return br = self.browser log('Downloading cover from:', cached_url) try: cdata = br.open_novisit(cached_url, timeout=timeout).read() result_queue.put((self, cdata)) except: log.exception('Failed to download cover from:', cached_url)
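# ---------------------------------------------------------------------------
# Simplified standalone sketch of what Amazon.create_query() above assembles
# (Python 3 urllib.parse instead of the Python 2 urllib used by the plugin,
# and without calibre's token helpers or the Shift_JIS handling for the .jp
# site). Names prefixed with an underscore are invented for this sketch.
from urllib.parse import urlencode as _urlencode

def _build_amazon_search_url(domain='com', title=None, authors=None, isbn=None, asin=None):
    # Branch order mirrors create_query(): an ASIN keyword search wins, then an
    # ISBN field search, then a title/author search restricted to books.
    q = {'search-alias': 'aps', 'unfiltered': '1',
         'sort': 'relevanceexprank' if domain == 'com' else 'relevancerank'}
    if asin:
        q['field-keywords'] = asin
    elif isbn:
        q['field-isbn'] = isbn
    else:
        q['search-alias'] = 'digital-text' if domain == 'br' else 'stripbooks'
        if title:
            q['field-title'] = title
        if authors:
            q['field-author'] = authors[0]
    website = {'uk': 'co.uk', 'jp': 'co.jp', 'br': 'com.br'}.get(domain, domain)
    return 'http://www.amazon.%s/s/?%s' % (website, _urlencode(q))
# Example:
# _build_amazon_search_url(title='Dune', authors=['Frank Herbert'])
# -> 'http://www.amazon.com/s/?search-alias=stripbooks&unfiltered=1&sort=...'
# ---------------------------------------------------------------------------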
class Moly_hu(Source): name = 'Moly_hu' description = _('Downloads metadata and covers from moly.hu') author = 'Hoffer Csaba & Kloon & fatsadt & otapi & Dezso' version = (1, 0, 9) minimum_calibre_version = (0, 8, 0) capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset([ 'title', 'authors', 'identifier:isbn', 'identifier:moly_hu', 'tags', 'comments', 'rating', 'series', 'series_index', 'publisher', 'pubdate', 'language', 'languages' ]) has_html_comments = False supports_gzip_transfer_encoding = False can_get_multiple_covers = True KEY_MAX_BOOKS = 'max_books' KEY_MAX_COVERS = 'max_covers' options = (Option( KEY_MAX_BOOKS, 'number', 3, _('Maximum number of books to get'), _('The maximum number of books to process from the moly.hu search result' )), Option( KEY_MAX_COVERS, 'number', 5, _('Maximum number of covers to get'), _('The maximum number of covers to process for the chosen book' ))) BASE_URL = 'https://moly.hu' BOOK_URL = BASE_URL + '/konyvek/' SEARCH_URL = BASE_URL + '/kereses?utf8=%E2%9C%93&q=' def create_query(self, log, title=None, authors=None, identifiers={}): isbn = check_isbn(identifiers.get('isbn', None)) if isbn is not None: return Moly_hu.SEARCH_URL + isbn if title is not None: search_title = quote(title.encode('utf-8')) else: search_title = '' if authors is not None: search_author = quote(authors[0].encode('utf-8')) else: search_author = '' search_page = Moly_hu.SEARCH_URL + \ '%s+%s' % (search_author, search_title) return search_page def get_cached_cover_url(self, identifiers): url = None moly_id = identifiers.get('moly_hu', None) if moly_id is None: isbn = check_isbn(identifiers.get('isbn', None)) if isbn is not None: moly_id = self.cached_isbn_to_identifier(isbn) if moly_id is not None: url = self.cached_identifier_to_cover_url(moly_id) return url def identify(self, log, result_queue, abort, title, authors, identifiers={}, timeout=30): ''' Note this method will retry without identifiers automatically if no match is found with identifiers. 
''' matches = [] moly_id = identifiers.get('moly_hu', None) log.info(u'\nTitle:%s\nAuthors:%s\n' % (title, authors)) br = browser() if moly_id: matches.append(Moly_hu.BOOK_URL + moly_id) else: query = self.create_query(log, title=title, authors=authors, identifiers=identifiers) if query is None: log.error('Insufficient metadata to construct query') return try: log.info('Querying: %s' % query) response = br.open(query) except Exception as e: if callable(getattr(e, 'getcode', None)) and e.getcode() == 404: log.info('Failed to find match for ISBN: %s' % isbn) else: err = 'Failed to make identify query: %r' % query log.exception(err) return as_unicode(e) try: raw = response.read().strip() raw = raw.decode('utf-8', errors='replace') if not raw: log.error('Failed to get raw result for query: %r' % query) return root = fromstring(clean_ascii_chars(raw)) except: msg = 'Failed to parse moly.hu page for query: %r' % query log.exception(msg) return msg isbn = check_isbn(identifiers.get('isbn', None)) self._parse_search_results(log, title, authors, root, matches, timeout, isbn) if abort.is_set(): return if not matches: log.error('No matches found with query: %r' % query) if identifiers and title and authors: log.info( 'No matches found with identifiers, retrying using only' ' title and authors') return self.identify(log, result_queue, abort, title=title, authors=authors, timeout=timeout) elif title and authors and title != title.split("(")[0]: log.info( 'No matches found with authors and title try removing () part from title, and search by title and author' ) tit = title.split("(")[0] return self.identify(log, result_queue, abort, title=tit, authors=authors, timeout=timeout) elif title and authors: log.info( 'No matches found with authors and title, retrying using only title' ) return self.identify(log, result_queue, abort, title=title, authors=None, timeout=timeout) return from calibre_plugins.moly_hu.worker import Worker workers = [ Worker(url, result_queue, br, log, i, self) for i, url in enumerate(matches) ] for w in workers: w.start() time.sleep(0.1) while not abort.is_set(): a_worker_is_alive = False for w in workers: w.join(0.2) if abort.is_set(): break if w.is_alive(): a_worker_is_alive = True if not a_worker_is_alive: break return None def _parse_search_results(self, log, orig_title, orig_authors, root, matches, timeout, isbn): max_results = self.prefs[Moly_hu.KEY_MAX_BOOKS] results = root.xpath('//a[@class="book_selector"]') log.info('Found %d possible books (max: %d)' % (len(results), max_results)) i = 0 for result in results: book_urls = result.xpath('@href') if isbn is None: etree.strip_tags(result, 'strong') author_n_title = result.text author_n_titles = author_n_title.split(':', 1) author = author_n_titles[0].strip(' \r\n\t') title = author_n_titles[1].strip(' \r\n\t') log.info('Orig: %s, target: %s' % (self.strip_accents(orig_title), self.strip_accents(title))) if orig_title: if orig_title.lower() not in title.lower( ) and self.strip_accents( orig_title) not in self.strip_accents(title): continue if orig_authors: author1 = orig_authors[0] authorsplit = author1.split(" ") author2 = author1 if len(authorsplit) > 1: author2 = '%s %s' % (authorsplit[1], authorsplit[0]) if author1.lower() not in author.lower( ) and self.strip_accents( author1) not in self.strip_accents( author) and author2.lower( ) not in author.lower() and self.strip_accents( author2) not in self.strip_accents(author): continue for book_url in book_urls: result_url = Moly_hu.BASE_URL + book_url if (result_url not in 
matches): matches.append(result_url) i += 1 if (i >= max_results): return if i == 0: for result in results: book_urls = result.xpath('@href') for book_url in book_urls: result_url = Moly_hu.BASE_URL + book_url if (result_url not in matches): matches.append(result_url) i += 1 if (i >= max_results): return def strip_accents(self, s): if s is None: return None symbols = (u"öÖüÜóÓőŐúÚéÉáÁűŰíÍ", u"oOuUoOoOuUeEaAuUiI") tr = dict([(ord(a), ord(b)) for (a, b) in zip(*symbols)]) return s.translate(tr).lower() def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): if not title: return urls = self.get_image_urls(title, authors, identifiers, log, abort, timeout) self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log) def get_image_urls(self, title, authors, identifiers, log, abort, timeout): cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: log.info('No cached cover found, running identify') rq = Queue() self.identify(log, rq, abort, title=title, authors=authors, identifiers=identifiers) if abort.is_set(): return results = [] while True: try: results.append(rq.get_nowait()) except Empty: break results.sort(key=self.identify_results_keygen( title=title, authors=authors, identifiers=identifiers)) for mi in results: cached_url = self.get_cached_cover_url(mi.identifiers) if cached_url is not None: break if cached_url is not None: return cached_url log.info('No cover found') return []
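# ---------------------------------------------------------------------------
# Standalone illustration of the accent-insensitive comparison performed by
# Moly_hu.strip_accents() above: Hungarian accented characters are mapped to
# plain ASCII before the lowercase substring checks in _parse_search_results().
_symbols = (u"öÖüÜóÓőŐúÚéÉáÁűŰíÍ", u"oOuUoOoOuUeEaAuUiI")
_tr = {ord(a): ord(b) for a, b in zip(*_symbols)}

def _strip_accents(s):
    return s.translate(_tr).lower() if s is not None else None
# Example:
# _strip_accents(u'Gárdonyi Géza')  -> 'gardonyi geza'
# _strip_accents(u'Egri csillagok') -> 'egri csillagok'
# ---------------------------------------------------------------------------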
class GoogleImages(Source): name = 'Google Images' version = (1, 0, 0) minimum_calibre_version = (2, 80, 0) description = _( 'Downloads covers from a Google Image search. Useful to find larger/alternate covers.' ) capabilities = frozenset(['cover']) can_get_multiple_covers = True supports_gzip_transfer_encoding = True options = ( Option( 'max_covers', 'number', 5, _('Maximum number of covers to get'), _('The maximum number of covers to process from the Google search result' )), Option('size', 'choices', 'svga', _('Cover size'), _('Search for covers larger than the specified size'), choices=OrderedDict(( ( 'any', _('Any size'), ), ( 'l', _('Large'), ), ( 'qsvga', _('Larger than %s') % '400x300', ), ( 'vga', _('Larger than %s') % '640x480', ), ( 'svga', _('Larger than %s') % '600x800', ), ( 'xga', _('Larger than %s') % '1024x768', ), ( '2mp', _('Larger than %s') % '2 MP', ), ( '4mp', _('Larger than %s') % '4 MP', ), ))), ) def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): if not title: return timeout = max(60, timeout) # Needs at least a minute title = ' '.join(self.get_title_tokens(title)) author = ' '.join(self.get_author_tokens(authors)) urls = self.get_image_urls(title, author, log, abort, timeout) self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log) @property def user_agent(self): return USER_AGENT def get_image_urls(self, title, author, log, abort, timeout): from calibre.utils.cleantext import clean_ascii_chars from urllib import urlencode import html5lib import json from collections import OrderedDict ans = OrderedDict() br = self.browser q = urlencode({ 'as_q': ('%s %s' % (title, author)).encode('utf-8') }).decode('utf-8') sz = self.prefs['size'] if sz == 'any': sz = '' elif sz == 'l': sz = 'isz:l,' else: sz = 'isz:lt,islt:%s,' % sz # See https://www.google.com/advanced_image_search to understand this # URL scheme url = 'https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg'.format( q, sz) log('Search URL: ' + url) raw = br.open(url).read().decode('utf-8') root = html5lib.parse(clean_ascii_chars(raw), treebuilder='lxml', namespaceHTMLElements=False) for div in root.xpath('//div[@class="rg_meta"]'): try: data = json.loads(div.text) except Exception: continue if 'ou' in data: ans[data['ou']] = True return list(ans.iterkeys())
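# ---------------------------------------------------------------------------
# Standalone sketch (Python 3 urllib.parse) of how GoogleImages.get_image_urls()
# above maps the configured size preference onto Google's "tbs" parameter and
# builds the advanced image-search URL; the real method then scrapes the
# result page with html5lib. The helper name is invented for this sketch.
from urllib.parse import urlencode as _urlencode

def _google_image_search_url(title, author, size='svga'):
    q = _urlencode({'as_q': '%s %s' % (title, author)})
    if size == 'any':
        sz = ''
    elif size == 'l':
        sz = 'isz:l,'
    else:
        sz = 'isz:lt,islt:%s,' % size  # e.g. 'qsvga', 'vga', 'svga', 'xga', '2mp', '4mp'
    return ('https://www.google.com/search?as_st=y&tbm=isch&'
            '{}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&'
            'tbs={}iar:t,ift:jpg'.format(q, sz))
# Example:
# _google_image_search_url('Dune', 'Frank Herbert', size='xga')
# -> '...&tbs=isz:lt,islt:xga,iar:t,ift:jpg'
# ---------------------------------------------------------------------------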
class OverDrive(Source): name = 'Overdrive' version = (1, 0, 0) minimum_calibre_version = (2, 80, 0) description = _( 'Downloads metadata and covers from Overdrive\'s Content Reserve') capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset([ 'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher', 'identifier:isbn', 'series', 'series_index', 'languages', 'identifier:overdrive' ]) has_html_comments = True supports_gzip_transfer_encoding = False cached_cover_url_is_reliable = True options = (Option( 'get_full_metadata', 'bool', True, _('Download all metadata (slow)'), _('Enable this option to gather all metadata available from Overdrive.' )), ) config_help_message = '<p>' + _( 'Additional metadata can be taken from Overdrive\'s book detail' ' page. This includes a limited set of tags used by libraries, comments, language,' ' and the e-book ISBN. Collecting this data is disabled by default due to the extra' ' time required. Check the download all metadata option below to' ' enable downloading this data.') def identify( self, log, result_queue, abort, title=None, authors=None, # {{{ identifiers={}, timeout=30): ovrdrv_id = identifiers.get('overdrive', None) isbn = identifiers.get('isbn', None) br = self.browser ovrdrv_data = self.to_ovrdrv_data(br, log, title, authors, ovrdrv_id) if ovrdrv_data: title = ovrdrv_data[8] authors = ovrdrv_data[6] mi = Metadata(title, authors) self.parse_search_results(ovrdrv_data, mi) if ovrdrv_id is None: ovrdrv_id = ovrdrv_data[7] if self.prefs['get_full_metadata']: self.get_book_detail(br, ovrdrv_data[1], mi, ovrdrv_id, log) if isbn is not None: self.cache_isbn_to_identifier(isbn, ovrdrv_id) result_queue.put(mi) return None # }}} def download_cover( self, log, result_queue, abort, # {{{ title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False): import mechanize cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: log.info('No cached cover found, running identify') rq = Queue() self.identify(log, rq, abort, title=title, authors=authors, identifiers=identifiers) if abort.is_set(): return results = [] while True: try: results.append(rq.get_nowait()) except Empty: break results.sort(key=self.identify_results_keygen( title=title, authors=authors, identifiers=identifiers)) for mi in results: cached_url = self.get_cached_cover_url(mi.identifiers) if cached_url is not None: break if cached_url is None: log.info('No cover found') return if abort.is_set(): return ovrdrv_id = identifiers.get('overdrive', None) br = self.browser req = mechanize.Request(cached_url) if ovrdrv_id is not None: referer = self.get_base_referer( ) + 'ContentDetails-Cover.htm?ID=' + ovrdrv_id req.add_header('referer', referer) log('Downloading cover from:', cached_url) try: cdata = br.open_novisit(req, timeout=timeout).read() result_queue.put((self, cdata)) except: log.exception('Failed to download cover from:', cached_url) # }}} def get_cached_cover_url(self, identifiers): # {{{ url = None ovrdrv_id = identifiers.get('overdrive', None) if ovrdrv_id is None: isbn = identifiers.get('isbn', None) if isbn is not None: ovrdrv_id = self.cached_isbn_to_identifier(isbn) if ovrdrv_id is not None: url = self.cached_identifier_to_cover_url(ovrdrv_id) return url # }}} def get_base_referer( self): # to be used for passing referrer headers to cover download choices = [ 'https://overdrive.chipublib.org/82DC601D-7DDE-4212-B43A-09D821935B01/10/375/en/', 'https://emedia.clevnet.org/9D321DAD-EC0D-490D-BFD8-64AE2C96ECA8/10/241/en/', 
'https://singapore.lib.overdrive.com/F11D55BE-A917-4D63-8111-318E88B29740/10/382/en/', 'https://ebooks.nypl.org/20E48048-A377-4520-BC43-F8729A42A424/10/257/en/', 'https://spl.lib.overdrive.com/5875E082-4CB2-4689-9426-8509F354AFEF/10/335/en/' ] return choices[random.randint(0, len(choices) - 1)] def format_results(self, reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid): fix_slashes = re.compile(r'\\/') thumbimage = fix_slashes.sub('/', thumbimage) worldcatlink = fix_slashes.sub('/', worldcatlink) cover_url = re.sub('(?P<img>(Ima?g(eType-)?))200', '\g<img>100', thumbimage) social_metadata_url = base_url + 'TitleInfo.aspx?ReserveID=' + reserveid + '&FormatID=' + formatid series_num = '' if not series: if subtitle: title = od_title + ': ' + subtitle else: title = od_title else: title = od_title m = re.search("([0-9]+$)", subtitle) if m: series_num = float(m.group(1)) return [ cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title ] def safe_query(self, br, query_url, post=''): ''' The query must be initialized by loading an empty search results page this page attempts to set a cookie that Mechanize doesn't like copy the cookiejar to a separate instance and make a one-off request with the temp cookiejar ''' import mechanize goodcookies = br._ua_handlers['_cookies'].cookiejar clean_cj = mechanize.CookieJar() cookies_to_copy = [] for cookie in goodcookies: copied_cookie = copy.deepcopy(cookie) cookies_to_copy.append(copied_cookie) for copied_cookie in cookies_to_copy: clean_cj.set_cookie(copied_cookie) if post: br.open_novisit(query_url, post) else: br.open_novisit(query_url) br.set_cookiejar(clean_cj) def overdrive_search(self, br, log, q, title, author): import mechanize # re-initialize the cookiejar to so that it's clean clean_cj = mechanize.CookieJar() br.set_cookiejar(clean_cj) q_query = q + 'default.aspx/SearchByKeyword' q_init_search = q + 'SearchResults.aspx' # get first author as string - convert this to a proper cleanup function later author_tokens = list( self.get_author_tokens(author, only_first_author=True)) title_tokens = list( self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True)) xref_q = '' if len(author_tokens) <= 1: initial_q = ' '.join(title_tokens) xref_q = '+'.join(author_tokens) else: initial_q = ' '.join(author_tokens) for token in title_tokens: if len(xref_q) < len(token): xref_q = token log.error('Initial query is %s' % initial_q) log.error('Cross reference query is %s' % xref_q) q_xref = q + 'SearchResults.svc/GetResults?iDisplayLength=50&sSearch=' + xref_q query = '{"szKeyword":"' + initial_q + '"}' # main query, requires specific Content Type header req = mechanize.Request(q_query) req.add_header('Content-Type', 'application/json; charset=utf-8') br.open_novisit(req, query) # initiate the search without messing up the cookiejar self.safe_query(br, q_init_search) # get the search results object results = False iterations = 0 while results is False: iterations += 1 xreq = mechanize.Request(q_xref) xreq.add_header('X-Requested-With', 'XMLHttpRequest') xreq.add_header('Referer', q_init_search) xreq.add_header('Accept', 'application/json, text/javascript, */*') raw = br.open_novisit(xreq).read() for m in re.finditer( ur'"iTotalDisplayRecords":(?P<displayrecords>\d+).*?"iTotalRecords":(?P<totalrecords>\d+)', raw): if int(m.group('totalrecords')) == 0: return '' elif int(m.group('displayrecords')) >= 1: results = True elif int(m.group('totalrecords')) >= 1 and 
iterations < 3: if xref_q.find('+') != -1: xref_tokens = xref_q.split('+') xref_q = xref_tokens[0] for token in xref_tokens: if len(xref_q) < len(token): xref_q = token # log.error('rewrote xref_q, new query is '+xref_q) else: xref_q = '' q_xref = q + 'SearchResults.svc/GetResults?iDisplayLength=50&sSearch=' + xref_q return self.sort_ovrdrv_results(raw, log, title, title_tokens, author, author_tokens)
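# ---------------------------------------------------------------------------
# Standalone sketch of the record-count check inside overdrive_search() above:
# the raw JSON text is scanned for iTotalDisplayRecords / iTotalRecords, and
# the keyword is simplified and the query retried when records exist but none
# are displayed. The sample response below is invented for illustration.
import re as _re

_sample_raw = '{"iTotalDisplayRecords":0, "sEcho":"1", "iTotalRecords":3, "aaData":[]}'

def _overdrive_result_state(raw):
    m = _re.search(r'"iTotalDisplayRecords":(?P<displayrecords>\d+)'
                   r'.*?"iTotalRecords":(?P<totalrecords>\d+)', raw)
    if m is None:
        return 'unparseable response'
    if int(m.group('totalrecords')) == 0:
        return 'no results at all, give up'
    if int(m.group('displayrecords')) >= 1:
        return 'results displayed, hand off to sort_ovrdrv_results()'
    return 'records exist but none displayed, simplify xref_q and retry'
# _overdrive_result_state(_sample_raw)
# -> 'records exist but none displayed, simplify xref_q and retry'
# ---------------------------------------------------------------------------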
class Douban(Source): name = 'Douban Books' author = 'Li Fanxi, xcffl, jnozsc' version = (3, 1, 0) minimum_calibre_version = (2, 80, 0) description = _( 'Downloads metadata and covers from Douban.com. ' 'Useful only for Chinese language books.' ) capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset([ 'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher', 'identifier:isbn', 'rating', 'identifier:douban' ]) # language currently disabled supports_gzip_transfer_encoding = True cached_cover_url_is_reliable = True DOUBAN_API_KEY = '0df993c66c0c636e29ecbb5344252a4a' DOUBAN_API_URL = 'https://api.douban.com/v2/book/search' DOUBAN_BOOK_URL = 'https://book.douban.com/subject/%s/' options = ( Option( 'include_subtitle_in_title', 'bool', True, _('Include subtitle in book title:'), _('Whether to append subtitle in the book title.') ), ) def to_metadata(self, browser, log, entry_, timeout): # {{{ from calibre.utils.date import parse_date, utcnow douban_id = entry_.get('id') title = entry_.get('title') description = entry_.get('summary') # subtitle = entry_.get('subtitle') # TODO: std metada doesn't have this field publisher = entry_.get('publisher') isbn = entry_.get('isbn13') # ISBN11 is obsolute, use ISBN13 pubdate = entry_.get('pubdate') authors = entry_.get('author') book_tags = entry_.get('tags') rating = entry_.get('rating') cover_url = entry_.get('images', {}).get('large') series = entry_.get('series') if not authors: authors = [_('Unknown')] if not douban_id or not title: # Silently discard this entry return None mi = Metadata(title, authors) mi.identifiers = {'douban': douban_id} mi.publisher = publisher mi.comments = description # mi.subtitle = subtitle # ISBN isbns = [] if isinstance(isbn, (type(''), bytes)): if check_isbn(isbn): isbns.append(isbn) else: for x in isbn: if check_isbn(x): isbns.append(x) if isbns: mi.isbn = sorted(isbns, key=len)[-1] mi.all_isbns = isbns # Tags mi.tags = [tag['name'] for tag in book_tags] # pubdate if pubdate: try: default = utcnow().replace(day=15) mi.pubdate = parse_date(pubdate, assume_utc=True, default=default) except: log.error('Failed to parse pubdate %r' % pubdate) # Ratings if rating: try: mi.rating = float(rating['average']) / 2.0 except: log.exception('Failed to parse rating') mi.rating = 0 # Cover mi.has_douban_cover = None u = cover_url if u: # If URL contains "book-default", the book doesn't have a cover if u.find('book-default') == -1: mi.has_douban_cover = u # Series if series: mi.series = series['title'] return mi # }}} def get_book_url(self, identifiers): # {{{ db = identifiers.get('douban', None) if db is not None: return ('douban', db, self.DOUBAN_BOOK_URL % db) # }}} def create_query(self, log, title=None, authors=None, identifiers={}): # {{{ try: from urllib.parse import urlencode except ImportError: from urllib import urlencode SEARCH_URL = 'https://api.douban.com/v2/book/search?count=10&' ISBN_URL = 'https://api.douban.com/v2/book/isbn/' SUBJECT_URL = 'https://api.douban.com/v2/book/' q = '' t = None isbn = check_isbn(identifiers.get('isbn', None)) subject = identifiers.get('douban', None) if isbn is not None: q = isbn t = 'isbn' elif subject is not None: q = subject t = 'subject' elif title or authors: def build_term(prefix, parts): return ' '.join(x for x in parts) title_tokens = list(self.get_title_tokens(title)) if title_tokens: q += build_term('title', title_tokens) author_tokens = list( self.get_author_tokens(authors, only_first_author=True) ) if author_tokens: q += ((' ' if q != '' else '') + 
build_term('author', author_tokens)) t = 'search' q = q.strip() if isinstance(q, type(u'')): q = q.encode('utf-8') if not q: return None url = None if t == "isbn": url = ISBN_URL + q elif t == 'subject': url = SUBJECT_URL + q else: url = SEARCH_URL + urlencode({ 'q': q, }) if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '': if t == "isbn" or t == "subject": url = url + "?apikey=" + self.DOUBAN_API_KEY else: url = url + "&apikey=" + self.DOUBAN_API_KEY return url # }}} def download_cover( self, log, result_queue, abort, # {{{ title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False ): cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: log.info('No cached cover found, running identify') rq = Queue() self.identify( log, rq, abort, title=title, authors=authors, identifiers=identifiers ) if abort.is_set(): return results = [] while True: try: results.append(rq.get_nowait()) except Empty: break results.sort( key=self.identify_results_keygen( title=title, authors=authors, identifiers=identifiers ) ) for mi in results: cached_url = self.get_cached_cover_url(mi.identifiers) if cached_url is not None: break if cached_url is None: log.info('No cover found') return if abort.is_set(): return br = self.browser log('Downloading cover from:', cached_url) try: cdata = br.open_novisit(cached_url, timeout=timeout).read() if cdata: result_queue.put((self, cdata)) except: log.exception('Failed to download cover from:', cached_url) # }}} def get_cached_cover_url(self, identifiers): # {{{ url = None db = identifiers.get('douban', None) if db is None: isbn = identifiers.get('isbn', None) if isbn is not None: db = self.cached_isbn_to_identifier(isbn) if db is not None: url = self.cached_identifier_to_cover_url(db) return url # }}} def get_all_details( self, br, log, entries, abort, # {{{ result_queue, timeout ): for relevance, i in enumerate(entries): try: ans = self.to_metadata(br, log, i, timeout) if isinstance(ans, Metadata): ans.source_relevance = relevance db = ans.identifiers['douban'] for isbn in getattr(ans, 'all_isbns', []): self.cache_isbn_to_identifier(isbn, db) if ans.has_douban_cover: self.cache_identifier_to_cover_url(db, ans.has_douban_cover) self.clean_downloaded_metadata(ans) result_queue.put(ans) except: log.exception('Failed to get metadata for identify entry:', i) if abort.is_set(): break # }}} def identify( self, log, result_queue, abort, title=None, authors=None, # {{{ identifiers={}, timeout=30 ): import json query = self.create_query( log, title=title, authors=authors, identifiers=identifiers ) if not query: log.error('Insufficient metadata to construct query') return br = self.browser try: raw = br.open_novisit(query, timeout=timeout).read() except Exception as e: log.exception('Failed to make identify query: %r' % query) return as_unicode(e) try: j = json.loads(raw) except Exception as e: log.exception('Failed to parse identify results') return as_unicode(e) if 'books' in j: entries = j['books'] else: entries = [] entries.append(j) if not entries and identifiers and title and authors and \ not abort.is_set(): return self.identify( log, result_queue, abort, title=title, authors=authors, timeout=timeout ) # There is no point running these queries in threads as douban # throttles requests returning 403 Forbidden errors self.get_all_details(br, log, entries, abort, result_queue, timeout) return None
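# ---------------------------------------------------------------------------
# Standalone sketch (Python 3) of the URL selection in Douban.create_query()
# above: an ISBN lookup is preferred, then a douban subject id, then a keyword
# search; the API key (taken from the plugin above) is appended with '?' or
# '&' depending on the form. The helper name is invented for this sketch.
from urllib.parse import urlencode as _urlencode

_DOUBAN_API_KEY = '0df993c66c0c636e29ecbb5344252a4a'

def _douban_query_url(title=None, authors=None, isbn=None, douban_id=None):
    if isbn:
        return 'https://api.douban.com/v2/book/isbn/%s?apikey=%s' % (isbn, _DOUBAN_API_KEY)
    if douban_id:
        return 'https://api.douban.com/v2/book/%s?apikey=%s' % (douban_id, _DOUBAN_API_KEY)
    q = ' '.join(x for x in (title, (authors or [None])[0]) if x)
    if not q:
        return None
    return ('https://api.douban.com/v2/book/search?count=10&' +
            _urlencode({'q': q}) + '&apikey=' + _DOUBAN_API_KEY)
# Example:
# _douban_query_url(title='三体', authors=['刘慈欣'])
# -> 'https://api.douban.com/v2/book/search?count=10&q=%E4%B8%89%E4%BD%93+...&apikey=...'
# ---------------------------------------------------------------------------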
class noosfere(Source): # see https://manual.calibre-ebook.com/fr/plugins.html#calibre.ebooks.metadata.sources.base.Source # and https://manual.calibre-ebook.com/fr/_modules/calibre/ebooks/metadata/sources/base.html#Source name = 'noosfere DB' description = _( 'Source extension: downloads and sets metadata from noosfere.org for selected volumes' ) author = 'Louis Richard Pirlet' version = (0, 9, 0) minimum_calibre_version = (5, 11, 0) ID_NAME = 'noosfere' capabilities = frozenset(['identify', 'cover']) touched_fields = frozenset([ 'title', 'authors', 'identifier:isbn', 'identifier:nsfr_id', 'languages', 'comments', 'publisher', 'pubdate', 'series', 'tags' ]) has_html_comments = True supports_gzip_transfer_encoding = True # Since noosfere is written in French for French-speaking people, I # took the liberty to write the following information in French. I will # comment with a translation in English. # config help message: noosfere is a database that presents information # about French books, tagged as science fiction. That information spans # from authors to films based on the books, including translators, # illustrators, critics... and of course their works. Books that were # published several times are each exposed as a "volume". Each of those volumes # shares the authors and the book content; they may or may not share the # cover, the publisher, the publisher's collection and the associated order # number, the summary, the reviews, etc. The choice of the volume is made # by the program. One may somewhat influence the choice through the dialog # box `priorité de tri´. On the other hand, there is no official way to # programmatically update a custom column, so there is a tick box that will # push the information along with the publisher. Please read the doc to # understand how to put it back later in the right place with the right format. config_help_message = '<p>' + _( " noosfere est une base de donnée qui propose des informations" " à propos des ouvrages, de genre science fiction, disponibles en langue française." " Ces informations vont de l'auteur aux films produits sur base de l'ouvrage en" " passant par les auteurs, les traducteurs, les illustrateurs, les critiques..." " et bien sur, leurs oeuvres. Les livres qui ont été publiés plusieurs fois" " sont repris chacun sous un volume dont est exposé l'ISBN, la date de dépot legal" " (repris sous la date de publication, souvent méconnue), la couverture, l'éditeur," " la collection de l'editeur et son numèro d'ordre. Le choix, programmé, du volume" " est quelque peu paramétrable par la boite de dialogue `priorité de tri´. " " D'autre part, il n'existe pas de moyens officiels de remplir une colonne définie" " par l'utilisateur. Pour rester dans les clous, je propose de remplir le champs" " de l'editeur avec, conjointement à celui-ci, la collection et son numero d'ordre." " Une petite procédure, décrite dans la doc devrait remettre tout en ordre." 
) # priority handling, a choice box that propose to set the priority over # the oldest published volume with a preference for an ISBN balanced for a maximum of comments # the latest published volume with a preference for an ISBN balanced for a maximum of comments # the oldest balanced for a maximum of comments # the latest balanced for a maximum of comments # the very oldest # the very latest # note that the selected volume will have the most represented editor # (if editor x reedited 4 time the book, and editor Y only once, # editor x will certainly be selected) # see algorithm explanation in worker.py 'ret_top_vol_indx(self, url, book_title)' PRIORITY_HANDLING = { '0_oldest_with_isbn': _("le plus ancien pondéré, préfère un isbn"), '1_latest_with_isbn': _("le plus récent pondéré, préfère un isbn"), '2_oldest': _("un plus ancien pondéré"), '3_latest': _("un plus recent pondéré"), '4_very_oldest': _("vraiment le plus ancien"), '5_very_latest': _("vraiment le plus recent") } options = ( Option( 'fat_publisher', 'bool', False, _( "Ajoute collection et son numéro d'ordre au champ èditeur" ), # add the editor's collection and the associated order number to the publisher field _("Cochez cette case pour ajouter la collection et son numéro d'ordre au champs de l'éditeur." "Voir LIS-MOI editeur_collection_seriel-code.txt" ) # check this box to enable... see README publisher_collection_seriel-code.txt ), Option( 'debug_level', 'number', 0, _('Loquacité du journal, de 0 à 7'), # verbosity of the log _('Le niveau de loquacité. O un minimum de rapport, 1 rapport etendu de __init__,' # the level of verbosity. value 0 will output the minimum, ' 2 rapport étendu de worker, 4 rapport etendu des annexes... La somme 3, 5 ou 7' # 1 debug messages of __init__, 2 debug messages of worker ' peut etre introduite. Ainsi 7 donne un maximun de rapport. Note: ce sont les 3' # 4 debug level of accessory code... 3, 5 or 7 is the sum ' derniers bits de debug_level en notation binaire' ) # of the value defined above. In fact it is a bitwise flag ), # spread over the last 3 bits of debug_level Option( 'priority_handling', 'choices', '0_oldest_with_isbn', _('priorité de tri:'), _("Priorité de tri du volume." ), # how to push the priority over the choice of the volume choices=PRIORITY_HANDLING), Option( 'requested_editor', 'string', None, _("impose un éditeur"), # impose a publisher _("le volume sera choisi chez l'éditeur le plus representé... SAUF:" # the volume is picked-up from the most prevalent publisher " Remplir ce champ pour forcer un éditeur defini... 
DOIT" # EXCEPTED: fill this field to force the publisher wanted " ETRE UN MATCH PARFAIT sinon le volume sera choisi sans tenir compte" # MUST BE A PERFECT MATCH else the volume will ne picked-up " de l'éditeur.") # without consideration to the publisher ), ) # this defines a method to access both the code and the data in the object @property def priority_handling(self): x = getattr(self, 'prio_handling', None) if x is not None: return x prio_handling = self.prefs['priority_handling'] if prio_handling not in self.PRIORITY_HANDLING: prio_handling = sorted( self.PRIORITY_HANDLING.items() )[0] # sort the dict to make a list and select first item (that should be the default) return prio_handling @property def extended_publisher(self): x = getattr(self, 'ext_pub', None) if x is not None: return x ext_pub = self.prefs.get('fat_publisher', False) return ext_pub @property def dbg_lvl(self): x = getattr(self, 'dl', None) if x is not None: return x dl = self.prefs.get('debug_level', False) return dl @property def must_be_editor(self): x = getattr(self, 'te', None) if x is not None: return x te = self.prefs.get('requested_editor', None) return te # copied from other working metadata source (thanks to David Forrester and the Kobo Books Metadata source) def get_cached_cover_url(self, identifiers): # I guess this routine returns an url that was discovered somewhere else and put into cache # probably using cache_identifier_to_cover_url in the worket.py # as ISBN is missing sometime in noosfere # as noosfere does not provide any proprietary id # I will use nsfr_id, a combination of bk_<significant part of book_url>_vl_<significant part of vol_url> # this should allow to go directly to the book page (that could be the vol page if there is only one vol for the book) # url = None nsfr_id = identifiers.get('nsfr_id', None) if nsfr_id is None: isbn = identifiers.get('isbn', None) if isbn is not None: nsfr_id = self.cached_isbn_to_identifier(isbn) if nsfr_id is not None: url = self.cached_identifier_to_cover_url(nsfr_id) return url def ret_author_index(self, log, br, authors): # Trouve la reference de l'auteur dans la soupe de noosfere # retourne author_index, un dictionnaire avec key=AUTEUR, val=href # L'idée est de renvoyer UNE seule reference... trouver l'auteur est primordial si isbn is indisponible # # Find author references in the soup produced by noosfere, return author_index a dictionary with key=author, val=href # the idea is to find ONE single reference... 
to get the author is important if isbn is unavailable # debug = self.dbg_lvl & 1 log.info("\nIn ret_author_index(soup)") if debug: log.info("authors : ", authors) all_author_index = {} author_index = [] # try to get a short list of authors using "MOTS-CLEFS" match for j in range(len(authors)): rkt = { "Mots": authors[j], "auteurs": "auteurs", "ModeMoteur": "MOTS-CLEFS", "ModeRecherche": "AND", "recherche": "1", "Envoyer": "Envoyer" } url = "https://www.noosfere.org/livres/noosearch.asp" soup = ret_soup(log, self.dbg_lvl, br, url, rkt=rkt)[0] tmp_ai = soup.select('a[href*="auteur.asp"]') if len(tmp_ai): for i in range(len(tmp_ai)): url_author, author, perta = tmp_ai[i]["href"], tmp_ai[ i].text, tmp_ai[i].find_previous('tr').select_one( 'td').text ratio = SM( None, ret_clean_text(log, self.dbg_lvl, author, swap=True), authors[j]).ratio() if debug: log.info("pertinence : ", perta, end=" ; ") log.info("SM.ratio : {:.3f}".format(ratio), end=" ; ") log.info("url_author : ", url_author, end=" ; ") log.info("authors[j] : ", authors[j], end=" ; ") log.info("author : ", ret_clean_text(log, self.dbg_lvl, author)) if ratio >= .6: all_author_index[url_author] = [ratio, author] if not len( all_author_index ): # failed the short list, let's go for the long list using "LITTERAL" match if debug: log.info("exact match failed, trying fuzzy match") # return self.ret_author_index(self, log, br, authors, ModeMoteur="LITTERAL") # ca marche pas... ret_author_index() got multiple values for argument 'ModeMoteur' # this is NOT a function but a class method # it is possible to move the common part of this code below, but my mind refuses to understand the change # when debugging... so duplicate the code (maybe an optimiseur later will make it... m'en fout) for j in range(len(authors)): rkt = { "Mots": authors[j], "auteurs": "auteurs", "ModeMoteur": "LITTERAL", "ModeRecherche": "AND", "recherche": "1", "Envoyer": "Envoyer" } url = "https://www.noosfere.org/livres/noosearch.asp" soup = ret_soup(log, self.dbg_lvl, br, url, rkt=rkt)[0] tmp_ai = soup.select('a[href*="auteur.asp"]') if len(tmp_ai): for i in range(len(tmp_ai)): url_author, author, perta = tmp_ai[i][ "href"], tmp_ai[i].text, tmp_ai[ i].find_previous('tr').select_one( 'td').text ratio = SM( None, ret_clean_text(log, self.dbg_lvl, author, swap=True), authors[j]).ratio() if debug: log.info("pertinence : ", perta, end=" ; ") log.info("SM.ratio : {:.3f}".format(ratio), end=" ; ") log.info("url_author : ", url_author, end=" ; ") log.info("authors[j] : ", authors[j], end=" ; ") log.info( "author : ", ret_clean_text(log, self.dbg_lvl, author)) if ratio >= .6: all_author_index[url_author] = [ratio, author] sorted_author_index = dict( sorted(all_author_index.items(), key=lambda x: x[1][0], reverse=True)) if debug: log.info("sorted_author_index :\n", sorted_author_index) # With python 3.6 onward, the standard dict type maintains insertion order by default. 
# Python 3.7 elevates this implementation detail to a language specification, # noosfere sort the hightest pertinence first (the most probable author comes out first) # so, I have no need to sort on pertinence field (would be different for calibre below Version 5) # # we only consider those with the highest pertinence, we limit to when the pertinence drops to less than half of the maximum # count = 0 for key, ref in sorted_author_index.items(): count += 1 url_author, ratio, name_author = key, ref[0], ref[1] author_index.append(url_author) if debug: log.info("ratio : ", ratio, end=" ; ") log.info("author : ", name_author, end=" ; ") log.info("url_author : ", url_author, end=" ; ") log.info("count : ", count) # log.info("author_index : ",author_index) # may be long if count == 8: break if debug: log.info('return from ret_author_index') return author_index def ret_book_per_author_index(self, log, br, author_index, title, book_index): # Find the books references of a known author from the returned soup for noosfere # returns a dict "book_per_author_index{}" with key as title and val as the link to the book # Idea is to send back a few references that hopefully contains the title expected # # Trouver la reference des livres d'un auteur connu dans la soupe produite par noosfere # retourne "book_per_author_index{}", un dictionnaire avec key=titre, val=href # L'idée est de renvoyer serie de reference, dont on extrait les livres proches du titre de calibre # # now that we have a list of authors, let's get all the books associated with them # The "book_per_author_index" dictionnary will contain all book's references... # If a book has a common url it will be overwritten by the following author, ensuring a list of unique books # debug = self.dbg_lvl & 1 log.info( "\nIn ret_book_per_author_index(self, log, br, author_index, title, book_index)" ) if debug: log.info("author_index : ", author_index) log.info("title : ", title) log.info("book_index : ", book_index) book_per_author_index = {} unsorted_book_index = {} for i in range(len(author_index)): rqt = author_index[i] + "&Niveau=livres" url = "https://www.noosfere.org" + rqt soup = ret_soup(log, self.dbg_lvl, br, url)[0] tmp_bpai = soup.select('a[href*="ditionsLivre.asp"]') for i in range(len(tmp_bpai)): book_title = tmp_bpai[i].text.lower() book_url = (tmp_bpai[i]["href"].replace( './', '/livres/').split('&'))[0] ratio = SM(None, title, ret_clean_text(log, self.dbg_lvl, book_title)).ratio() if debug: log.info("SM.ratio : {:.3f}".format(ratio), end=" ; ") log.info("book_url : ", book_url, end=" ; ") log.info('tmp_bpai[i]["href"] : ', tmp_bpai[i]["href"], end=" ; ") log.info("book_title : ", book_title) if ratio > .6: unsorted_book_index[ratio] = [book_url, "", book_title] if ratio == 1: unsorted_book_index = {} unsorted_book_index[ratio] = [book_url, "", book_title] break # we have a perfect match no need to go further in the author books # and I know it could cause problem iff several authors produce an identical title sorted_book_index = dict( sorted(unsorted_book_index.items(), reverse=True)) if debug: log.info("sorted bySM.ratio") for key, ref in sorted_book_index.items(): if debug: log.info("SM.ratio : {:.3f}".format(key), end=" ; ") log.info("book_url : ", ref[0], end=" ; ") log.info("book_title : ", ref[2]) book_index[ref[0]] = ref[2] log.info('book_index[book_url] = book_title : ', book_index) if ratio == 1: log.info( "Perfect match, we got it and we can stop looking further") break # we have a perfect match no need to examine other authors if 
debug: log.info('return book_index from ret_book_per_author_index\n') return book_index def ISBN_ret_book_index(self, log, br, isbn, book_index): # Trouver la reference d'un livre (titre ou ISBN) dans la soupe produite par noosfere # retourne book_index{}, un dictionnaire avec key=book_url, val=title # L'idée est de trouver UNE seule reference... # Attention: on retourne une reference qui peut contenir PLUSIEURs volumes # C'est a dire: différents editeurs, différentes re-éditions et/ou, meme, un titre different... YESss) # # Find the book's reference (either title or ISBN) in the returned soup from noosfere # returns book_index{}, a dictionnary with key=book_url, val=title # The idea is to find ONE unique reference... # Caution: the reference may contains several volumes, # each with potentialy a different editor, a different edition date,... and even a different title # debug = self.dbg_lvl & 1 log.info("\nIn ISBN_ret_book_index(self, log, br, isbn, book_index)") # if isbn valid then we want to select exact match (correspondance exacte = MOTS-CLEFS) rkt = { "Mots": isbn, "livres": "livres", "ModeMoteur": "MOTS-CLEFS", "ModeRecherche": "AND", "recherche": "1", "Envoyer": "Envoyer" } url = "https://www.noosfere.org/livres/noosearch.asp" soup = ret_soup(log, self.dbg_lvl, br, url, rkt=rkt)[0] tmp_rbi = soup.select('a[href*="ditionsLivre.asp"]') if len(tmp_rbi): for i in range(len(tmp_rbi)): if debug: log.info( "tmp_rbi[" + str(i) + "].text, tmp_rbi[" + str(i) + "]['href'] : ", tmp_rbi[i].text, tmp_rbi[i]["href"]) book_index[tmp_rbi[i]["href"]] = tmp_rbi[i].text if debug: log.info("book_index : ", book_index) log.info("return book_index from ISBN_ret_book_index\n") return book_index def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30): # this is the entry point... # Note this method will retry without identifiers automatically... read can be resubmitted from inside it # if no match is found with identifiers. # log.info('self.dgb_lvl : ', self.dbg_lvl) log.info('self.extended_publisher : ', self.extended_publisher) log.info('self.priority_handling : ', self.priority_handling) log.info('self.must_be_editor : ', self.must_be_editor) debug = self.dbg_lvl & 1 log.info( '\nEntering identify(self, log, result_queue, abort, title=None, authors=None,identifiers={}, timeout=30)' ) if debug: log.info('log : ', log) log.info('result_queue : ', result_queue) log.info('abort : ', abort) log.info('title : ', title) log.info('authors : ', authors, type(authors)) log.info('identifiers : ', identifiers, type(identifiers)) log.info('\n') br = self.browser isbn = identifiers.get('isbn', None) if isbn: isbn = verify_isbn(log, self.dbg_lvl, isbn) log.info('ISBN value is : ', isbn) # the nsfr_id is designed to be the significant part of the url: # that is the number after the "=" in the url containing "niourf.asp?numlivre" # on can force the access to a particular volume by setting the value of nsfr_id to vl$<number> # could be an entry point if I can make sure that noosfere DB is alone and in interactive mode... nsfr_id = identifiers.get('nsfr_id', None) log.info('nsfr_id value is : ', nsfr_id) log.info('"Clean" both the authors list and the title... 
') if authors: for i in range(len(authors)): authors[i] = ret_clean_text(log, self.dbg_lvl, authors[i]) if title: title = ret_clean_text(log, self.dbg_lvl, title) log.info('getting one or more book url') book_index = { } # book_index={} is a dict: {key:ref} with: book_url, book_title = key, ref if nsfr_id: log.info('trying noosfere id, ', nsfr_id) nsfr = nsfr_id.split("$") if "bk" in nsfr[0]: url = "/livres/EditionsLivre.asp?numitem=" + nsfr[1] if "vl" in nsfr[2]: url = "/livres/niourf.asp?numlivre=" + nsfr[3] book_index[url] = title elif "vl" in nsfr[0]: url = "/livres/niourf.asp?numlivre=" + nsfr[1] book_index[url] = title else: log.info('noosfere id not valid...') if not book_index: log.info('trying ISBN', isbn) if isbn: book_index = self.ISBN_ret_book_index(log, br, isbn, book_index) if not len(book_index): log.error("This ISBN was not found: ", isbn, "trying with title", title, "and author", authors) return self.identify(log, result_queue, abort, title=title, authors=authors, timeout=timeout) elif title and authors: log.info('trying using authors and title') author_index = self.ret_author_index(log, br, authors) if len(author_index): book_index = self.ret_book_per_author_index( log, br, author_index, title, book_index) if not len(author_index): log.info("Désolé, aucun auteur trouvé avec : ", authors) return # here maybe try with title alone... a dessiner lrp todo... ouais peut-etre pour le cas ou l'auteur serait trop noyé dans une masse de noms similaires if not book_index: log.error("No book found in noosfere... ") return if abort.is_set(): log.info('abort was set... aborting... ') return tmp_list, i = [], 0 for key, ref in book_index.items(): book_url, book_title = key, ref if debug: log.info("sending to worker", i, "book_url, book_title : ", book_url, ", ", book_title) i += 1 tmp_list.append((book_url, book_title)) log.info('\nCreating each worker... ') from calibre_plugins.noosfere.worker import Worker workers = [ Worker(log, data[0], data[1], isbn, result_queue, br, i, self, self.dbg_lvl) for i, data in enumerate(tmp_list) ] for w in workers: w.start() # Don't send all requests at the same time time.sleep(0.2) while not abort.is_set(): a_worker_is_alive = False for w in workers: w.join(0.2) if abort.is_set(): log.info('abort was set while in loop... aborting... ') break if w.is_alive(): a_worker_is_alive = True if not a_worker_is_alive: break if debug: log.info("return None from identify") return None def download_cover(self, log, result_queue, abort, title=None, authors=None, identifiers={}, timeout=30): # willl download cover from Noosfere provided it was found (and then cached)... If not, it will # run the metadata download and try to cache the cover url... 
cached_url = self.get_cached_cover_url(identifiers) if cached_url is None: log.info('No cached cover found, running identify') rq = Queue() self.identify(log, rq, abort, title=title, authors=authors, identifiers=identifiers) if abort.is_set(): return results = [] while True: try: results.append(rq.get_nowait()) except Empty: break results.sort(key=self.identify_results_keygen( title=title, authors=authors, identifiers=identifiers)) for mi in results: cached_url = self.get_cached_cover_url(mi.identifiers) if cached_url is not None: break if cached_url is None: log.info('No cover found') return if abort.is_set(): return br = self.browser log('Downloading cover from:', cached_url) try: cdata = br.open_novisit(cached_url, timeout=timeout).read() result_queue.put((self, cdata)) except: log.exception('Failed to download cover from:', cached_url)
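# ---------------------------------------------------------------------------
# Standalone sketch of the nsfr_id handling in noosfere.identify() above: the
# identifier 'bk$<book id>$vl$<volume id>' (or just 'vl$<volume id>') is split
# on '$' and mapped onto either the book page (EditionsLivre.asp) or, when a
# volume id is present, the volume page (niourf.asp). The identifiers shown in
# the examples are invented for illustration.
def _nsfr_id_to_url(nsfr_id):
    nsfr = nsfr_id.split('$')
    if 'bk' in nsfr[0]:
        url = '/livres/EditionsLivre.asp?numitem=' + nsfr[1]
        if len(nsfr) > 3 and 'vl' in nsfr[2]:
            url = '/livres/niourf.asp?numlivre=' + nsfr[3]
        return url
    if 'vl' in nsfr[0]:
        return '/livres/niourf.asp?numlivre=' + nsfr[1]
    return None  # not a valid noosfere id
# _nsfr_id_to_url('bk$12345$vl$67890') -> '/livres/niourf.asp?numlivre=67890'
# _nsfr_id_to_url('vl$67890')          -> '/livres/niourf.asp?numlivre=67890'
# ---------------------------------------------------------------------------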