class KindleHighResCovers(Source):
    name = 'Kindle hi-res covers'
    description = 'Downloads high resolution covers for Kindle editions from Amazon'
    capabilities = frozenset(['cover'])
    author = 'Leonardo Brondani Schenkel <*****@*****.**>'
    version = (0, 4, 0)
    can_get_multiple_covers = True

    KEY_MAX_COVERS = 'max_covers'

    options = (Option(
        KEY_MAX_COVERS, 'number', 2, _('Maximum number of covers to get'),
        _('The maximum number of covers to get from amazon.com. Since the covers are fetched from two sources, you may end up with two versions of each retrieved cover.'
          )), )

    def download_cover(self,
                       log,
                       result_queue,
                       abort,
                       title=None,
                       authors=None,
                       identifiers={},
                       timeout=60,
                       get_best_cover=False):
        urls = get_cover_urls(log, title, authors, identifiers, timeout)
        log.info('Cover URLs: ' + repr(urls))

        if urls:
            log.info('Downloading covers from the collected URLs')
            self.download_multiple_covers(title, authors, urls, False, timeout,
                                          result_queue, abort, log, None)
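For reference, here is a minimal standalone sketch of the calling convention these plugins share: calibre hands download_cover a log object, a result queue and an abort event, and the plugin puts (plugin, cover_bytes) tuples on the queue. DummyLog and stub_download_cover are illustrative stand-ins, not calibre APIs.

# Minimal harness sketching the download_cover calling convention used above.
# DummyLog and stub_download_cover are illustrative stand-ins, not calibre APIs.
import threading
from queue import Queue


class DummyLog:
    def info(self, *args):
        print('INFO:', *args)


def stub_download_cover(log, result_queue, abort, title=None, authors=None,
                        identifiers={}, timeout=60):
    if abort.is_set():
        return
    log.info('Would fetch covers for', title, 'by', authors)
    result_queue.put(('stub-plugin', b'<cover bytes>'))


if __name__ == '__main__':
    rq, abort = Queue(), threading.Event()
    stub_download_cover(DummyLog(), rq, abort, title='Example Title',
                        authors=['Example Author'],
                        identifiers={'mobi-asin': 'B000000000'})
    print(rq.get_nowait())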
# Example 2
class BigBookSearch(Source):

    name = 'Big Book Search'
    description = _(
        'Downloads multiple book covers from Amazon. Useful to find alternate covers.'
    )
    capabilities = frozenset(['cover'])
    config_help_message = _('Configure the Big Book Search plugin')
    can_get_multiple_covers = True
    options = (Option(
        'max_covers', 'number', 5, _('Maximum number of covers to get'),
        _('The maximum number of covers to process from the search result')), )
    supports_gzip_transfer_encoding = True

    def download_cover(self,
                       log,
                       result_queue,
                       abort,
                       title=None,
                       authors=None,
                       identifiers={},
                       timeout=30,
                       get_best_cover=False):
        if not title:
            return
        br = self.browser
        tokens = tuple(self.get_title_tokens(title)) + tuple(
            self.get_author_tokens(authors))
        urls = get_urls(br, tokens)
        self.download_multiple_covers(title, authors, urls, get_best_cover,
                                      timeout, result_queue, abort, log)
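The interesting step here is combining calibre's title and author tokens into a single search query before handing it to the external get_urls() helper. A rough standalone sketch of that step follows, with a plain whitespace tokenizer standing in for calibre's smarter token helpers.

# Rough sketch: join title/author tokens into one URL-encoded search query.
# simple_tokens() is a simplified stand-in for get_title_tokens/get_author_tokens.
from urllib.parse import urlencode


def simple_tokens(text):
    return tuple(t for t in (text or '').split() if t)


def build_query(title, authors):
    tokens = simple_tokens(title)
    for a in authors or ():
        tokens += simple_tokens(a)
    return urlencode({'q': ' '.join(tokens)})


print(build_query('A Study in Scarlet', ['Arthur Conan Doyle']))
# q=A+Study+in+Scarlet+Arthur+Conan+Doyle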
# Example 3
class GoogleImages(Source):

    name = 'Google Images'
    description = _('Downloads covers from a Google Image search. Useful to find larger/alternate covers.')
    capabilities = frozenset(['cover'])
    config_help_message = _('Configure the Google Image Search plugin')
    can_get_multiple_covers = True
    options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'),
                      _('The maximum number of covers to process from the Google search result')),
               Option('size', 'choices', 'svga', _('Cover size'),
                      _('Search for covers larger than the specified size'),
                      choices=OrderedDict((
                          ('any', _('Any size'),),
                          ('l', _('Large'),),
                          ('qsvga', _('Larger than %s')%'400x300',),
                          ('vga', _('Larger than %s')%'640x480',),
                          ('svga', _('Larger than %s')%'800x600',),
                          ('xga', _('Larger than %s')%'1024x768',),
                          ('2mp', _('Larger than %s')%'2 MP',),
                          ('4mp', _('Larger than %s')%'4 MP',),
                      ))),
    )

    def download_cover(self, log, result_queue, abort,
            title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
        if not title:
            return
        timeout = max(60, timeout)  # Needs at least a minute
        title = ' '.join(self.get_title_tokens(title))
        author = ' '.join(self.get_author_tokens(authors))
        urls = self.get_image_urls(title, author, log, abort, timeout)
        self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)

    def get_image_urls(self, title, author, log, abort, timeout):
        from calibre.utils.ipc.simple_worker import fork_job, WorkerError
        try:
            return fork_job('calibre.ebooks.metadata.sources.google_images',
                    'search', args=(title, author, self.prefs['size'], timeout), no_output=True, abort=abort, timeout=timeout)['result']
        except WorkerError as e:
            if e.orig_tb:
                log.error(e.orig_tb)
            log.exception('Searching Google failed: ' + as_unicode(e))
        except Exception as e:
            log.exception('Searching Google failed: ' + as_unicode(e))

        return []
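The fork_job call above isolates the Google search in a separate worker process with a hard timeout. Below is a standalone analogue of that pattern using only the standard library; it is not calibre's fork_job, and do_search is a placeholder for the real search.

# Standalone analogue of the fork_job pattern: run a possibly slow search in a
# child process and give up after a timeout. do_search is a placeholder.
from concurrent.futures import ProcessPoolExecutor, TimeoutError


def do_search(title, author, size):
    # Placeholder for the real image search.
    return ['https://example.com/cover1.jpg', 'https://example.com/cover2.jpg']


def search_with_timeout(title, author, size, timeout=30):
    with ProcessPoolExecutor(max_workers=1) as pool:
        future = pool.submit(do_search, title, author, size)
        try:
            return future.result(timeout=timeout)
        except TimeoutError:
            return []


if __name__ == '__main__':
    print(search_with_timeout('Dune', 'Frank Herbert', 'svga', timeout=10))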
# Example 4
class Douban(Source):
    name = "Douban Book"
    author = "Li Fanxi, xcffl, jnozsc, else"
    version = (4, 0, 1)
    minimum_calibre_version = (5, 0, 0)

    description = ("Downloads metadata and covers from Douban.com. "
                   "Useful only for Chinese language books.")

    capabilities = frozenset(["identify", "cover"])
    touched_fields = frozenset([
        "title",
        "authors",
        "tags",
        "pubdate",
        "comments",
        "publisher",
        "identifier:isbn",
        "rating",
        "identifier:douban",
    ])  # language currently disabled
    supports_gzip_transfer_encoding = True
    cached_cover_url_is_reliable = True

    ISBN_URL = "http://douban.com/isbn/"
    SUBJECT_URL = "http://book.douban.com/subject/"
    DOUBAN_BOOK_URL = 'https://book.douban.com/subject/%s/'

    options = (Option(
        "include_subtitle_in_title",
        "bool",
        True,
        ("Include subtitle in book title:"),
        ("Whether to append subtitle in the book title."),
    ), )

    def identify(
        self,
        log,
        result_queue,
        abort,
        title=None,
        authors=None,
        identifiers={},
        timeout=30,
    ):
        import json

        time.sleep(random.randint(1, 3))

        log.info("start get metadata from douban...")
        log.info(str(identifiers))
        # query = self.create_query(log, title=title, authors=authors, identifiers=identifiers)
        book = self.get_book(log, identifiers)

        # There is no point running these queries in threads as douban
        # throttles requests returning 403 Forbidden errors
        self.get_all_details(log, book, abort, result_queue, timeout)

        return None

    def to_metadata(self, log, entry_, timeout):  # {{{
        from calibre.utils.date import parse_date, utcnow

        log.info("to_metadata")
        douban_id = entry_.get("id")
        title = entry_.get("title")
        description = entry_.get("summary")
        # subtitle = entry_.get('subtitle')  # TODO: std metadata doesn't have this field
        publisher = entry_.get("publisher")
        isbn = entry_.get("isbn13")  # ISBN11 is obsolute, use ISBN13
        pubdate = entry_.get("pubdate")
        authors = entry_.get("author")
        # authors = "author"
        book_tags = entry_.get("tags")
        rating = entry_.get("rating")
        cover_url = entry_.get("cover")
        series = entry_.get("series")

        if not authors:
            authors = [("Unknown")]
        if not douban_id or not title:
            # Silently discard this entry
            return None

        mi = Metadata(title, authors)
        mi.identifiers = {"douban": douban_id}
        mi.publisher = publisher
        mi.comments = description
        # mi.subtitle = subtitle

        # ISBN
        isbns = []
        if isinstance(isbn, (type(""), bytes)):
            if check_isbn(isbn):
                isbns.append(isbn)
        else:
            for x in isbn:
                if check_isbn(x):
                    isbns.append(x)
        if isbns:
            mi.isbn = sorted(isbns, key=len)[-1]
        mi.all_isbns = isbns

        # Tags
        mi.tags = book_tags

        # pubdate
        if pubdate:
            try:
                default = utcnow().replace(day=15)
                mi.pubdate = parse_date(pubdate,
                                        assume_utc=True,
                                        default=default)
            except BaseException:
                log.error("Failed to parse pubdate %r" % pubdate)

        if rating:
            try:
                # mi.publisher += "#PrB.rating#" + str(rating)
                mi.rating = rating / 2.0
            except BaseException:
                log.exception("Failed to parse rating")
                mi.rating = 0

        # Cover
        mi.has_douban_cover = None
        u = cover_url
        if u:
            # If URL contains "book-default", the book doesn't have a cover
            if u.find("book-default") == -1:
                mi.has_douban_cover = u

        # Series
        if series:
            mi.series = series

        return mi

    # }}}

    def get_isbn_url(self, isbn):  # {{{
        if isbn is not None:
            return self.ISBN_URL + isbn
        else:
            return ""

    # }}}

    def get_douban_url(self, identifiers):
        isbn = self.get_book_isbn(identifiers)
        url = self.get_isbn_url(isbn)
        if url:
            return url
        tup = self.get_book_url(identifiers)
        if tup:
            return tup[2]

    def get_book_url(self, identifiers):  # {{{
        db = identifiers.get('douban', None)
        if db is not None:
            return ('douban', db, self.DOUBAN_BOOK_URL % db)

    # }}}

    def get_book_isbn(self, identifiers):
        isbn = check_isbn(identifiers.get("isbn", None))
        return isbn

    def download_cover(
        self,
        log,
        result_queue,
        abort,
        title=None,
        authors=None,
        identifiers={},
        timeout=30,
        get_best_cover=False,
    ):
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.info("No cached cover found, running identify")
            rq = Queue()
            self.identify(log,
                          rq,
                          abort,
                          title=title,
                          authors=authors,
                          identifiers=identifiers)
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers))
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break
        if cached_url is None:
            log.info("No cover found")
            return

        if abort.is_set():
            return
        br = self.browser
        log("Downloading cover from:", cached_url)
        try:
            cdata = br.open_novisit(cached_url, timeout=timeout).read()
            if cdata:
                result_queue.put((self, cdata))
        except BaseException:
            log.exception("Failed to download cover from:", cached_url)

    # }}}

    def get_cached_cover_url(self, identifiers):  # {{{
        url = None
        db = identifiers.get("douban", None)
        if db is None:
            isbn = identifiers.get("isbn", None)
            if isbn is not None:
                db = self.cached_isbn_to_identifier(isbn)
        if db is not None:
            url = self.cached_identifier_to_cover_url(db)

        return url

    # }}}

    def get_all_details(self, log, book, abort, result_queue, timeout):  # {{{
        try:
            log.info("get_all_details")
            ans = self.to_metadata(log, book, timeout)
            if isinstance(ans, Metadata):
                ans.source_relevance = 0
                douban_id = ans.identifiers["douban"]
                isbn = book.get("isbn13")
                self.cache_isbn_to_identifier(isbn, douban_id)
                if ans.has_douban_cover:
                    self.cache_identifier_to_cover_url(douban_id,
                                                       ans.has_douban_cover)
                self.clean_downloaded_metadata(ans)
                result_queue.put(ans)
        except BaseException:
            log.exception("Failed to get metadata for identify entry:",
                          book["id"])
        if abort.is_set():
            return

    # }}}

    def get_book(self, log, identifiers={}):
        log.info("start get book......")
        url = self.get_douban_url(identifiers)
        html = self.__get_html(url)
        if html == -1:
            # log.info("book not found: " + isbn)
            return -1

        soup = self.__get_soup(html=html)
        infos = self.__get_infos(soup=soup)
        isbn = self.__get_isbn(log, identifiers, soup, infos)
        book = {"isbn13": isbn}
        book["author"] = self.__get_authors(infos)
        book["publisher"] = self.__get_info(infos, "出版社:")
        book["pubdate"] = self.__get_info(infos, "出版年:")
        book["series"] = self.__get_info(infos, "丛书:")

        book["id"] = self.__get_id(soup=soup)
        book["tags"] = self.__get_tags(soup=soup)
        book["rating"] = self.__get_score(soup=soup)
        book["title"] = self.__get_title(soup=soup)
        book["summary"] = self.__get_intro(soup=soup)
        book["cover"] = self.__get_cover(soup=soup)
        return book

    def __get_html(self, url):
        headers_ = {
            "User-Agent":
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
        }
        request = urllib.request.Request(url, headers=headers_)
        try:
            response = urllib.request.urlopen(request)
        except BaseException:
            return -1
        html = response.read().decode("utf-8")
        return html

    def __get_soup(self, html=""):
        soup = BeautifulSoup(html, "lxml", exclude_encodings="utf-8")
        return soup

    def __get_infos(self, soup):
        soupSelect = str(soup.select("#info"))
        soupTemp = BeautifulSoup(str(soupSelect),
                                 "lxml",
                                 exclude_encodings="utf-8")
        infosTemp = soupTemp.text.splitlines()
        infos = []
        for info in infosTemp:
            tmp = info.strip()
            if tmp and tmp != "/":
                infos.append(tmp)
        infos.remove("[")
        infos.remove("]")
        return infos

    def __get_info(self, infos, name):
        for token in infos:
            if token.find(name) != -1:
                return token[len(name) + 1:]
        return ""

    def __get_authors(self, infos):
        begin = -1
        end = -1
        i = 0
        for token in infos:
            if token == "作者:":
                begin = i
            elif token.find("出版社:") != -1:
                end = i + 1
                break
            else:
                i = i + 1
        authors = []
        if begin == -1:
            return authors
        if end == -1:
            authors.append(infos[begin + 1])
            return authors
        else:
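            # Normalize the various full-width/CJK bracket and dot characters
            # used in nationality prefixes (e.g. （美）, 【美】, •) to a single
            # "[...]" and "·" form before adding each author.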
            for i in range(begin + 1, end):
                author = infos[i].strip()
                author = author.replace("【", "[")
                author = author.replace("】", "]")
                author = author.replace("(", "[")
                author = author.replace(")", "]")
                author = author.replace("〔", "[")
                author = author.replace("〕", "]")
                author = author.replace("(", "[")
                author = author.replace(")", "]")
                author = author.replace("]", "] ")
                author = author.replace("•", "·")
                author = author.replace("・", "·")
                authors.append(author)
            return authors

    def __get_id(self, soup):
        idSelects = str(soup.select("meta")).split()
        for item in idSelects:
            idIndex = item.find("douban.com/book/subject/")
            if idIndex != -1:
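                # idIndex + 24 skips the matched "douban.com/book/subject/"
                # prefix (24 characters); [:-2] trims the trailing '/"' of the
                # attribute value, leaving the numeric subject id.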
                id = item[idIndex + 24:-2]
                return id
        return 0

    def __get_tags(self, soup):
        tagSelect = str(soup.select("#db-tags-section > div"))
        tagTemp = BeautifulSoup(str(tagSelect),
                                "lxml",
                                exclude_encodings="utf-8")
        tagText = tagTemp.text
        tags = tagText.split()
        tags.remove("[")
        tags.remove("]")
        return tags

    def __get_cover(self, soup):
        coverSelect = str(soup.select("#mainpic > a > img"))
        tempCover = str(
            BeautifulSoup(str(coverSelect), "lxml", exclude_encodings="utf-8"))
        index1 = tempCover.find("src=")
        tempCover = tempCover[index1 + 5:]
        index2 = tempCover.find('"')
        tempCover = tempCover[:index2]
        return tempCover

    def __get_score(self, soup):
        soupSelect = str(
            soup.select(
                "#interest_sectl > div > div.rating_self.clearfix > strong"))
        soupTemp = BeautifulSoup(str(soupSelect),
                                 "lxml",
                                 exclude_encodings="utf-8")
        score = soupTemp.text.strip("[] \n\t")
        if score:
            s = float(score)
            return s
        else:
            return 0.0

    def __get_title(self, soup):
        soupSelect = str(soup.select("body>div>h1>span"))
        soupTemp = BeautifulSoup(str(soupSelect),
                                 "lxml",
                                 exclude_encodings="utf-8")
        return str(soupTemp.text).strip("[] \n\t")

    def __get_intro(self, soup):
        soupSelect = soup.select("#link-report")
        soupTemp = BeautifulSoup(str(soupSelect),
                                 "lxml",
                                 exclude_encodings="utf-8")
        intro = str(soupTemp.text).strip("[] \n\t")
        find = intro.find("(展开全部)")
        if find != -1:
            intro = intro[find + 6:]
        return intro.strip("[] \n\t")

    def __get_isbn(self, log, identifiers, soup, infos):
        isbn = identifiers.get("isbn", None)
        if isbn:
            return isbn
        pattern = re.compile(r"ISBN: (\d+)", re.IGNORECASE)
        isbn = ''
        for info in infos:
            match = pattern.match(info)
            if match:
                isbn = match.group(1)
                break
        return isbn
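A compact standalone illustration of the "#info" parsing approach this scraper uses: select the info block, split its text into stripped lines, then read labelled fields. The HTML snippet is fabricated for the example, and html.parser is used so it runs without lxml installed.

# Simplified, self-contained version of the #info parsing above.
from bs4 import BeautifulSoup

SAMPLE = """
<div id="info">
  <span>作者:</span> <a>[美] Example Author</a><br/>
  <span>出版社:</span> Example Press<br/>
  <span>出版年:</span> 2020-1<br/>
  <span>ISBN:</span> 9780000000000<br/>
</div>
"""

soup = BeautifulSoup(SAMPLE, "html.parser")
lines = [ln.strip() for ln in soup.select_one("#info").get_text().splitlines()
         if ln.strip() and ln.strip() != "/"]


def field(lines, label):
    # Simplified version of __get_info(): find the labelled line, return the rest.
    for token in lines:
        if label in token:
            return token.split(label, 1)[1].strip()
    return ""


print(field(lines, "出版社:"))  # Example Press
print(field(lines, "ISBN:"))    # 9780000000000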
# Example 5
class Douban(Source):

    name = 'Douban Books'
    author = 'Li Fanxi'
    version = (2, 1, 2)
    minimum_calibre_version = (2, 80, 0)

    description = _('Downloads metadata and covers from Douban.com. '
                    'Useful only for Chinese language books.')

    capabilities = frozenset(['identify', 'cover'])
    touched_fields = frozenset([
        'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher',
        'identifier:isbn', 'rating', 'identifier:douban'
    ])  # language currently disabled
    supports_gzip_transfer_encoding = True
    cached_cover_url_is_reliable = True

    DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
    DOUBAN_BOOK_URL = 'https://book.douban.com/subject/%s/'

    options = (Option('include_subtitle_in_title', 'bool', True,
                      _('Include subtitle in book title:'),
                      _('Whether to append the subtitle to the book title.')), )

    def to_metadata(self, browser, log, entry_, timeout):  # {{{
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.utils.date import parse_date, utcnow
        from calibre.utils.cleantext import clean_ascii_chars

        XPath = partial(etree.XPath, namespaces=NAMESPACES)
        entry = XPath('//atom:entry')
        entry_id = XPath('descendant::atom:id')
        title = XPath('descendant::atom:title')
        description = XPath('descendant::atom:summary')
        subtitle = XPath("descendant::db:attribute[@name='subtitle']")
        publisher = XPath("descendant::db:attribute[@name='publisher']")
        isbn = XPath("descendant::db:attribute[@name='isbn13']")
        date = XPath("descendant::db:attribute[@name='pubdate']")
        creator = XPath("descendant::db:attribute[@name='author']")
        booktag = XPath("descendant::db:tag/attribute::name")
        rating = XPath("descendant::gd:rating/attribute::average")
        cover_url = XPath(
            "descendant::atom:link[@rel='image']/attribute::href")

        def get_text(extra, x):
            try:
                ans = x(extra)
                if ans:
                    ans = ans[0].text
                    if ans and ans.strip():
                        return ans.strip()
            except:
                log.exception('Programming error:')
            return None

        id_url = entry_id(entry_)[0].text.replace('http://', 'https://')
        douban_id = id_url.split('/')[-1]
        title_ = ': '.join([x.text for x in title(entry_)]).strip()
        subtitle = ': '.join([x.text for x in subtitle(entry_)]).strip()
        if self.prefs['include_subtitle_in_title'] and len(subtitle) > 0:
            title_ = title_ + ' - ' + subtitle
        authors = [x.text.strip() for x in creator(entry_) if x.text]
        if not authors:
            authors = [_('Unknown')]
        if not id_url or not title_:
            # Silently discard this entry
            return None

        mi = Metadata(title_, authors)
        mi.identifiers = {'douban': douban_id}
        try:
            log.info(id_url)
            raw = get_details(browser, id_url, timeout)
            feed = etree.fromstring(
                xml_to_unicode(clean_ascii_chars(raw),
                               strip_encoding_pats=True)[0],
                parser=etree.XMLParser(recover=True,
                                       no_network=True,
                                       resolve_entities=False))
            extra = entry(feed)[0]
        except:
            log.exception('Failed to get additional details for', mi.title)
            return mi
        mi.comments = get_text(extra, description)
        mi.publisher = get_text(extra, publisher)

        # ISBN
        isbns = []
        for x in [t.text for t in isbn(extra)]:
            if check_isbn(x):
                isbns.append(x)
        if isbns:
            mi.isbn = sorted(isbns, key=len)[-1]
        mi.all_isbns = isbns

        # Tags
        try:
            btags = [x for x in booktag(extra) if x]
            tags = []
            for t in btags:
                atags = [y.strip() for y in t.split('/')]
                for tag in atags:
                    if tag not in tags:
                        tags.append(tag)
        except:
            log.exception('Failed to parse tags:')
            tags = []
        if tags:
            mi.tags = [x.replace(',', ';') for x in tags]

        # pubdate
        pubdate = get_text(extra, date)
        if pubdate:
            try:
                default = utcnow().replace(day=15)
                mi.pubdate = parse_date(pubdate,
                                        assume_utc=True,
                                        default=default)
            except:
                log.error('Failed to parse pubdate %r' % pubdate)

        # Ratings
        if rating(extra):
            try:
                mi.rating = float(rating(extra)[0]) / 2.0
            except:
                log.exception('Failed to parse rating')
                mi.rating = 0

        # Cover
        mi.has_douban_cover = None
        u = cover_url(extra)
        if u:
            u = u[0].replace('/spic/', '/lpic/')
            # If URL contains "book-default", the book doesn't have a cover
            if u.find('book-default') == -1:
                mi.has_douban_cover = u
        return mi

    # }}}

    def get_book_url(self, identifiers):  # {{{
        db = identifiers.get('douban', None)
        if db is not None:
            return ('douban', db, self.DOUBAN_BOOK_URL % db)

    # }}}

    def create_query(self,
                     log,
                     title=None,
                     authors=None,
                     identifiers={}):  # {{{
        try:
            from urllib.parse import urlencode
        except ImportError:
            from urllib import urlencode
        SEARCH_URL = 'https://api.douban.com/book/subjects?'
        ISBN_URL = 'https://api.douban.com/book/subject/isbn/'
        SUBJECT_URL = 'https://api.douban.com/book/subject/'

        q = ''
        t = None
        isbn = check_isbn(identifiers.get('isbn', None))
        subject = identifiers.get('douban', None)
        if isbn is not None:
            q = isbn
            t = 'isbn'
        elif subject is not None:
            q = subject
            t = 'subject'
        elif title or authors:

            def build_term(prefix, parts):
                return ' '.join(x for x in parts)

            title_tokens = list(self.get_title_tokens(title))
            if title_tokens:
                q += build_term('title', title_tokens)
            author_tokens = list(
                self.get_author_tokens(authors, only_first_author=True))
            if author_tokens:
                q += ((' ' if q != '' else '') +
                      build_term('author', author_tokens))
            t = 'search'
        q = q.strip()
        if isinstance(q, type(u'')):
            q = q.encode('utf-8')
        if not q:
            return None
        url = None
        if t == "isbn":
            url = ISBN_URL + q
        elif t == 'subject':
            url = SUBJECT_URL + q
        else:
            url = SEARCH_URL + urlencode({
                'q': q,
            })
        if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '':
            if t == "isbn" or t == "subject":
                url = url + "?apikey=" + self.DOUBAN_API_KEY
            else:
                url = url + "&apikey=" + self.DOUBAN_API_KEY
        return url

    # }}}

    def download_cover(
            self,
            log,
            result_queue,
            abort,  # {{{
            title=None,
            authors=None,
            identifiers={},
            timeout=30,
            get_best_cover=False):
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.info('No cached cover found, running identify')
            rq = Queue()
            self.identify(log,
                          rq,
                          abort,
                          title=title,
                          authors=authors,
                          identifiers=identifiers)
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers))
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break
        if cached_url is None:
            log.info('No cover found')
            return

        if abort.is_set():
            return
        br = self.browser
        log('Downloading cover from:', cached_url)
        try:
            cdata = br.open_novisit(cached_url, timeout=timeout).read()
            if cdata:
                result_queue.put((self, cdata))
        except:
            log.exception('Failed to download cover from:', cached_url)

    # }}}

    def get_cached_cover_url(self, identifiers):  # {{{
        url = None
        db = identifiers.get('douban', None)
        if db is None:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                db = self.cached_isbn_to_identifier(isbn)
        if db is not None:
            url = self.cached_identifier_to_cover_url(db)

        return url

    # }}}

    def get_all_details(
            self,
            br,
            log,
            entries,
            abort,  # {{{
            result_queue,
            timeout):
        from lxml import etree
        for relevance, i in enumerate(entries):
            try:
                ans = self.to_metadata(br, log, i, timeout)
                if isinstance(ans, Metadata):
                    ans.source_relevance = relevance
                    db = ans.identifiers['douban']
                    for isbn in getattr(ans, 'all_isbns', []):
                        self.cache_isbn_to_identifier(isbn, db)
                    if ans.has_douban_cover:
                        self.cache_identifier_to_cover_url(
                            db, ans.has_douban_cover)
                    self.clean_downloaded_metadata(ans)
                    result_queue.put(ans)
            except:
                log.exception('Failed to get metadata for identify entry:',
                              etree.tostring(i))
            if abort.is_set():
                break

    # }}}

    def identify(
            self,
            log,
            result_queue,
            abort,
            title=None,
            authors=None,  # {{{
            identifiers={},
            timeout=30):
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.utils.cleantext import clean_ascii_chars

        XPath = partial(etree.XPath, namespaces=NAMESPACES)
        entry = XPath('//atom:entry')

        query = self.create_query(log,
                                  title=title,
                                  authors=authors,
                                  identifiers=identifiers)
        if not query:
            log.error('Insufficient metadata to construct query')
            return
        br = self.browser
        try:
            raw = br.open_novisit(query, timeout=timeout).read()
        except Exception as e:
            log.exception('Failed to make identify query: %r' % query)
            return as_unicode(e)
        try:
            parser = etree.XMLParser(recover=True, no_network=True)
            feed = etree.fromstring(xml_to_unicode(
                clean_ascii_chars(raw), strip_encoding_pats=True)[0],
                                    parser=parser)
            entries = entry(feed)
        except Exception as e:
            log.exception('Failed to parse identify results')
            return as_unicode(e)
        if not entries and identifiers and title and authors and \
                not abort.is_set():
            return self.identify(log,
                                 result_queue,
                                 abort,
                                 title=title,
                                 authors=authors,
                                 timeout=timeout)

        # There is no point running these queries in threads as douban
        # throttles requests returning 403 Forbidden errors
        self.get_all_details(br, log, entries, abort, result_queue, timeout)

        return None
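The routing in create_query() above is the heart of this plugin: an ISBN or Douban id maps to a direct lookup URL, anything else becomes a text search, and the API key is appended with '?' or '&' as appropriate. A condensed standalone sketch of that decision, with placeholder values and no calibre imports:

# Condensed sketch of the create_query() URL routing shown above.
from urllib.parse import urlencode

SEARCH_URL = 'https://api.douban.com/book/subjects?'
ISBN_URL = 'https://api.douban.com/book/subject/isbn/'
SUBJECT_URL = 'https://api.douban.com/book/subject/'


def build_url(isbn=None, douban_id=None, title=None, authors=None, apikey=None):
    if isbn:
        url, sep = ISBN_URL + isbn, '?'
    elif douban_id:
        url, sep = SUBJECT_URL + douban_id, '?'
    else:
        terms = ' '.join([title or ''] + list(authors or [])).strip()
        if not terms:
            return None
        url, sep = SEARCH_URL + urlencode({'q': terms}), '&'
    if apikey:
        url += sep + 'apikey=' + apikey
    return url


print(build_url(isbn='9780000000000', apikey='<key>'))
print(build_url(title='Example Title', authors=['Example Author']))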
# Example 6
class Douban(Source):

    name = 'Douban Books Proxy'
    author = 'Li Fanxi & Driftcrow'
    version = (2, 1, 2)
    minimum_calibre_version = (2, 80, 0)

    description = _('Downloads metadata and covers from Douban.com. '
                    'Useful only for Chinese language books.')

    capabilities = frozenset(['identify', 'cover'])
    touched_fields = frozenset([
        'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher',
        'identifier:isbn', 'rating', 'identifier:douban'
    ])  # language currently disabled
    supports_gzip_transfer_encoding = True
    cached_cover_url_is_reliable = True

    DOUBAN_API_KEY = '0bd1672394eb1ebf2374356abec15c3d'
    DOUBAN_BOOK_URL = 'https://book.douban.com/subject/%s/'
    DOUBAN_BOOK_PROXY = 'https://douban.uieee.com/v2/book/'
    # SEARCH_URL = 'https://douban.uieee.com/v2/book/search?'

    options = (Option('include_subtitle_in_title', 'bool', True,
                      _('Include subtitle in book title:'),
                      _('Whether to append the subtitle to the book title.')), )

    def to_metadata(self, browser, log, entry_, timeout):  # {{{
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.utils.date import parse_date, utcnow
        from calibre.utils.cleantext import clean_ascii_chars

        # log.info('entry_ is: ',entry_)
        id_url = entry_['url']
        douban_id = entry_['id']
        title_ = entry_['title']
        subtitle = entry_['subtitle']
        authors = [x.strip() for x in entry_['author'] if x]
        if not authors:
            authors = [_('Unknown')]

        mi = Metadata(title_, authors)
        mi.identifiers = {'douban': douban_id}
        mi.comments = entry_['summary']
        mi.publisher = entry_['publisher']

        # ISBN
        mi.isbn = entry_['isbn10']
        mi.all_isbns = [entry_['isbn10'], entry_['isbn13']]

        # Tags
        mi.tags = [x['name'].strip() for x in entry_['tags']]

        # pubdate
        pubdate = entry_['pubdate']
        if pubdate:
            try:
                default = utcnow().replace(day=15)
                mi.pubdate = parse_date(pubdate,
                                        assume_utc=True,
                                        default=default)
            except:
                log.error('Failed to parse pubdate %r' % pubdate)

        # Ratings
        mi.rating = float(entry_['rating']['average']) / 2.0

        # Cover
        mi.has_douban_cover = entry_['image']
        return mi

    # }}}

    def get_book_url(self, identifiers):  # {{{
        db = identifiers.get('douban', None)
        if db is not None:
            return ('douban', db, self.DOUBAN_BOOK_URL % db)

    # }}}

    def create_query(self,
                     log,
                     title=None,
                     authors=None,
                     identifiers={}):  # {{{
        try:
            from urllib.parse import urlencode
        except ImportError:
            from urllib import urlencode
        SEARCH_URL = self.DOUBAN_BOOK_PROXY + 'search?'
        ISBN_URL = self.DOUBAN_BOOK_PROXY + 'isbn/'
        SUBJECT_URL = self.DOUBAN_BOOK_PROXY + 'subject/'

        q = ''
        t = None
        isbn = check_isbn(identifiers.get('isbn', None))
        subject = identifiers.get('douban', None)
        if isbn is not None:
            q = isbn
            t = 'isbn'
        elif subject is not None:
            q = subject
            t = 'subject'
        elif title or authors:

            def build_term(prefix, parts):
                return ' '.join(x for x in parts)

            title_tokens = list(self.get_title_tokens(title))
            if title_tokens:
                q += build_term('title', title_tokens)
            author_tokens = list(
                self.get_author_tokens(authors, only_first_author=True))
            if author_tokens:
                q += ((' ' if q != '' else '') +
                      build_term('author', author_tokens))
            t = 'search'
        q = q.strip()
        if isinstance(q, type(u'')):
            q = q.encode('utf-8')
        if not q:
            return None
        url = None
        if t == "isbn":
            url = ISBN_URL + q
        elif t == 'subject':
            url = SUBJECT_URL + q
        else:
            url = SEARCH_URL + urlencode({
                'q': q,
            })
        # if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '':
        #     if t == "isbn" or t == "subject":
        #         url = url + "?apikey=" + self.DOUBAN_API_KEY
        #     else:
        #         url = url + "&apikey=" + self.DOUBAN_API_KEY
        return url

    # }}}

    def download_cover(
            self,
            log,
            result_queue,
            abort,  # {{{
            title=None,
            authors=None,
            identifiers={},
            timeout=30,
            get_best_cover=False):
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.info('No cached cover found, running identify')
            rq = Queue()
            self.identify(log,
                          rq,
                          abort,
                          title=title,
                          authors=authors,
                          identifiers=identifiers)
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers))
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break
        if cached_url is None:
            log.info('No cover found')
            return

        if abort.is_set():
            return
        br = self.browser
        log('Downloading cover from:', cached_url)
        try:
            cdata = br.open_novisit(cached_url, timeout=timeout).read()
            if cdata:
                result_queue.put((self, cdata))
        except:
            log.exception('Failed to download cover from:', cached_url)

    # }}}

    def get_cached_cover_url(self, identifiers):  # {{{
        url = None
        db = identifiers.get('douban', None)
        if db is None:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                db = self.cached_isbn_to_identifier(isbn)
        if db is not None:
            url = self.cached_identifier_to_cover_url(db)

        return url

    # }}}

    def get_all_details(
            self,
            br,
            log,
            entries,
            abort,  # {{{
            result_queue,
            timeout):
        # for relevance, i in enumerate(entries):
        for i in entries:
            try:
                ans = self.to_metadata(br, log, i, timeout)
                if isinstance(ans, Metadata):
                    # ans.source_relevance = relevance
                    db = ans.identifiers['douban']
                    for isbn in getattr(ans, 'all_isbns', []):
                        self.cache_isbn_to_identifier(isbn, db)
                    if ans.has_douban_cover:
                        self.cache_identifier_to_cover_url(
                            db, ans.has_douban_cover)
                    self.clean_downloaded_metadata(ans)

                    result_queue.put(ans)
            except:
                log.exception('Failed to get metadata for identify entry:', i)
            if abort.is_set():
                break

    # }}}

    def identify(
            self,
            log,
            result_queue,
            abort,
            title=None,
            authors=None,  # {{{
            identifiers={},
            timeout=30):
        import json
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.utils.cleantext import clean_ascii_chars

        # XPath = partial(etree.XPath, namespaces=NAMESPACES)
        # entry          = XPath('//atom:entry')

        query = self.create_query(log,
                                  title=title,
                                  authors=authors,
                                  identifiers=identifiers)
        if not query:
            log.error('Insufficient metadata to construct query')
            return
        br = self.browser
        try:
            raw = br.open_novisit(query, timeout=timeout).read()
        except Exception as e:
            log.exception('Failed to make identify query: %r' % query)
            return as_unicode(e)
        try:
            # parser = etree.XMLParser(recover=True, no_network=True)
            # log.info('parser is ', parser)
            # feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
            #     strip_encoding_pats=True)[0], parser=parser)

            # log.info('feed is ', feed)
            # entries = entry(feed)
            entries = []
            data = json.loads(raw)
            if 'books' in data:
                entries = data['books']
            else:
                entries.append(data)
        except Exception as e:
            log.exception('Failed to parse identify results')
            return as_unicode(e)
        if not entries and identifiers and title and authors and \
                not abort.is_set():
            return self.identify(log,
                                 result_queue,
                                 abort,
                                 title=title,
                                 authors=authors,
                                 timeout=timeout)

        # There is no point running these queries in threads as douban
        # throttles requests returning 403 Forbidden errors
        self.get_all_details(br, log, entries, abort, result_queue, timeout)

        return None
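The JSON handling in identify() above has to cope with two response shapes: a search returns a 'books' list, while a direct isbn/subject lookup returns a single book object. A standalone sketch of that normalisation, using 'in' instead of the Python 2-only dict.has_key():

# Standalone sketch of normalising the two JSON response shapes handled above.
import json

search_response = '{"count": 1, "books": [{"id": "1", "title": "Example"}]}'
single_response = '{"id": "2", "title": "Another Example"}'


def extract_entries(raw):
    data = json.loads(raw)
    return data['books'] if 'books' in data else [data]


print(extract_entries(search_response))
print(extract_entries(single_response))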
# Example 7
class JD(Source):

    name = 'JD'
    version = (0, 0, 1)
    author = 'Lewix Liu'
    minimum_calibre_version = (3, 6, 0)
    description = _(
        'Downloads metadata and covers from JD.com, an online book seller in China'
    )

    capabilities = frozenset(['identify', 'cover'])
    touched_fields = frozenset([
        'title', 'authors', 'pubdate', 'comments', 'publisher', 'series',
        'identifier:isbn', 'identifier:jd'
    ])
    supports_gzip_transfer_encoding = True
    has_html_comments = True
    options = (Option('add_authors', 'bool', False,
                      _('Include authors in search:'),
                      _('Whether to include authors when searching for books.')), )

    @property
    def user_agent(self):
        # Pass in an index to random_user_agent() to test with a particular
        # user agent
        #return random_user_agent(allow_ie=False)
        return 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0'

    def _get_book_url(self, sku):
        if sku:
            return 'https://item.jd.com/{}.html'.format(sku)

    def get_book_url(self, identifiers):  # {{{
        sku = identifiers.get('jd', None)
        if sku:
            return 'JD', sku, self._get_book_url(sku)

    # }}}

    def get_cached_cover_url(self, identifiers):  # {{{
        sku = identifiers.get('jd', None)
        if not sku:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                sku = self.cached_isbn_to_identifier(isbn)
        return self.cached_identifier_to_cover_url(sku)

    # }}}

    def create_query(self, log, title=None, authors=None, identifiers={}):
        try:
            from urllib.parse import urlencode
        except ImportError:
            from urllib import urlencode
        import time
        BASE_URL = 'https://search.jd.com/Search?'
        keywords = []
        isbn = check_isbn(identifiers.get('isbn', None))
        if isbn is not None:
            keywords.append(isbn)
        elif title:
            title_tokens = list(self.get_title_tokens(title))
            if title_tokens:
                keywords.extend(title_tokens)
            if self.prefs['add_authors']:
                author_tokens = self.get_author_tokens(authors,
                                                       only_first_author=True)
                if author_tokens:
                    keywords.extend(author_tokens)
        if not keywords:
            return None
        word = (' '.join(keywords)).encode('utf-8')
        params = {'keyword': word, 'enc': 'utf-8', 'wp': word, 'book': 'y'}
        return BASE_URL + urlencode(params)

    # }}}

    def identify(
            self,
            log,
            result_queue,
            abort,
            title=None,
            authors=None,  # {{{
            identifiers={},
            timeout=30):
        br = self.browser
        br.addheaders = [
            ('User-Agent',
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0'
             ),
            ('Accept',
             'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
             ), ('Accept-Language', 'en-US,en;q=0.8,zh-CN;q=0.5,zh;q=0.3'),
            ('Referer', 'https://www.jd.com/'), ('DNT', '1'),
            ('Connection', 'keep-alive'), ('Upgrade-Insecure-Requests', '1'),
            ('TE', 'Trailers')
        ]
        self.identifiers = identifiers
        if 'jd' in identifiers:
            items = [identifiers['jd']]
        else:
            query = self.create_query(log,
                                      title=title,
                                      authors=authors,
                                      identifiers=identifiers)
            if not query:
                log.error('Insufficient metadata to construct query:', query)
                return
            log('Using query URL:', query)
            try:
                raw = br.open(query, timeout=timeout).read().decode('utf-8')
            except Exception as e:
                log.exception('Failed to make identify query: %r' % query)
                return as_unicode(e)
            root = parse_html(raw)
            items = []
            items_low_prio = []
            items_tree = root.xpath('//*[@id="J_goodsList"]/ul/li')
            for item in items_tree:
                sku = item.get('data-sku')
                # Prefer listings marked 自营 (sold and shipped by JD itself)
                all_str = etree.tostring(item, method='text',
                                         encoding='utf-8').decode('utf-8')
                if all_str.find(u'自营') != -1:
                    items.append(sku)
                else:
                    items_low_prio.append(sku)
            items.extend(items_low_prio)

            if not items:
                log.error('Failed to get list of matching items')
                #log.debug('Response text:')
                #log.debug(raw)
                return

        if (not items and identifiers and title and authors
                and not abort.is_set()):
            if 'isbn' in identifiers:
                return
            identifiers.pop('jd', None)
            return self.identify(log,
                                 result_queue,
                                 abort,
                                 title=title,
                                 authors=authors,
                                 timeout=timeout)

        if not items:
            return

        workers = []
        items = items[:5]
        for i, item in enumerate(items):
            workers.append(
                Worker(item, i, result_queue, br.clone_browser(), timeout, log,
                       self))

        if not workers:
            return

        for w in workers:
            w.start()
            # Don't send all requests at the same time
            time.sleep(0.1)

        while not abort.is_set():
            a_worker_is_alive = False
            for w in workers:
                w.join(0.2)
                if abort.is_set():
                    break
                if w.is_alive():
                    a_worker_is_alive = True
            if not a_worker_is_alive:
                break

    # }}}

    def download_cover(
            self,
            log,
            result_queue,
            abort,  # {{{
            title=None,
            authors=None,
            identifiers={},
            timeout=30,
            get_best_cover=False):
        cached_url = self.get_cached_cover_url(identifiers)  # TODO
        if cached_url is None:
            log.info('No cached cover found, running identify')
            rq = Queue()
            self.identify(log,
                          rq,
                          abort,
                          title=title,
                          authors=authors,
                          identifiers=identifiers)
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers))
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break
        if cached_url is None:
            log.info('No cover found')
            return

        if abort.is_set():
            return
        br = self.browser
        log('Downloading cover from:', cached_url)
        try:
            cdata = br.open_novisit(cached_url, timeout=timeout).read()
            result_queue.put((self, cdata))
        except:
            log.exception('Failed to download cover from:', cached_url)
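The identify loop above fans work out to per-item Worker threads and then polls them with short join() timeouts so the abort event is honoured promptly. A standalone sketch of that polling pattern; FakeWorker stands in for the real scraping workers.

# Standalone sketch of the worker-polling loop used in identify() above.
import threading
import time


class FakeWorker(threading.Thread):
    def __init__(self, delay):
        super().__init__()
        self.delay = delay

    def run(self):
        time.sleep(self.delay)


def wait_for_workers(workers, abort):
    while not abort.is_set():
        alive = False
        for w in workers:
            w.join(0.2)
            if abort.is_set():
                break
            if w.is_alive():
                alive = True
        if not alive:
            break


if __name__ == '__main__':
    abort = threading.Event()
    workers = [FakeWorker(0.5), FakeWorker(1.0)]
    for w in workers:
        w.start()
        time.sleep(0.1)   # don't send all requests at the same time
    wait_for_workers(workers, abort)
    print('all workers finished')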
# Example 8
class GoogleImages(Source):

    name = 'Google Images'
    version = (1, 0, 2)
    minimum_calibre_version = (2, 80, 0)
    description = _('Downloads covers from a Google Image search. Useful to find larger/alternate covers.')
    capabilities = frozenset(['cover'])
    can_get_multiple_covers = True
    supports_gzip_transfer_encoding = True
    options = (Option('max_covers', 'number', 5, _('Maximum number of covers to get'),
                      _('The maximum number of covers to process from the Google search result')),
               Option('size', 'choices', 'svga', _('Cover size'),
                      _('Search for covers larger than the specified size'),
                      choices=OrderedDict((
                          ('any', _('Any size'),),
                          ('l', _('Large'),),
                          ('qsvga', _('Larger than %s')%'400x300',),
                          ('vga', _('Larger than %s')%'640x480',),
                          ('svga', _('Larger than %s')%'800x600',),
                          ('xga', _('Larger than %s')%'1024x768',),
                          ('2mp', _('Larger than %s')%'2 MP',),
                          ('4mp', _('Larger than %s')%'4 MP',),
                      ))),
    )

    def download_cover(self, log, result_queue, abort,
            title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
        if not title:
            return
        timeout = max(60, timeout)  # Needs at least a minute
        title = ' '.join(self.get_title_tokens(title))
        author = ' '.join(self.get_author_tokens(authors))
        urls = self.get_image_urls(title, author, log, abort, timeout)
        self.download_multiple_covers(title, authors, urls, get_best_cover, timeout, result_queue, abort, log)

    @property
    def user_agent(self):
        return random_user_agent(allow_ie=False)

    def get_image_urls(self, title, author, log, abort, timeout):
        from calibre.utils.cleantext import clean_ascii_chars
        try:
            from urllib.parse import urlencode
        except ImportError:
            from urllib import urlencode
        from collections import OrderedDict
        ans = OrderedDict()
        br = self.browser
        q = urlencode({'as_q': ('%s %s'%(title, author)).encode('utf-8')})
        if isinstance(q, bytes):
            q = q.decode('utf-8')
        sz = self.prefs['size']
        if sz == 'any':
            sz = ''
        elif sz == 'l':
            sz = 'isz:l,'
        else:
            sz = 'isz:lt,islt:%s,' % sz
        # See https://www.google.com/advanced_image_search to understand this
        # URL scheme
        url = 'https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg'.format(q, sz)
        log('Search URL: ' + url)
        raw = clean_ascii_chars(br.open(url).read().decode('utf-8'))
        root = parse_html(raw)
        results = root.xpath('//div/@data-tbnid')  # could also use data-id
        # from calibre.utils.ipython import ipython
        # ipython({'root': root, 'raw': raw, 'url': url, 'results': results})
        for tbnid in results:
            try:
                imgurl = imgurl_from_id(raw, tbnid)
            except Exception:
                continue
            if imgurl:
                ans[imgurl] = True
        return list(ans)
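Two small details worth isolating from the search code above: the configured size maps onto Google's 'tbs' fragment, and result URLs are deduplicated while preserving order by using an OrderedDict as an ordered set. A standalone sketch:

# Sketch of the size-to-tbs mapping and the order-preserving URL dedup above.
from collections import OrderedDict


def size_fragment(sz):
    if sz == 'any':
        return ''
    if sz == 'l':
        return 'isz:l,'
    return 'isz:lt,islt:%s,' % sz   # e.g. svga, xga, 2mp


def dedupe(urls):
    ans = OrderedDict()
    for u in urls:
        ans[u] = True
    return list(ans)


print(size_fragment('svga'))
print(dedupe(['a.jpg', 'b.jpg', 'a.jpg']))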
# Example 9
class ISBNDB(Source):

    name = 'ISBNDB'
    version = (1, 0, 0)
    minimum_calibre_version = (2, 80, 0)
    description = _('Downloads metadata from isbndb.com')

    capabilities = frozenset(['identify'])
    touched_fields = frozenset(
        ['title', 'authors', 'identifier:isbn', 'comments', 'publisher'])
    supports_gzip_transfer_encoding = True
    # Shortcut, since we have no cached cover URLs
    cached_cover_url_is_reliable = False

    options = (Option(
        'isbndb_key', 'string', None, _('IsbnDB key:'),
        _('To use isbndb.com you have to sign up for a free account '
          'at isbndb.com and get an access key.')), )

    config_help_message = ('<p>' + _(
        'To use metadata from isbndb.com you must sign'
        ' up for a free account, get an isbndb key, and enter it below.'
        ' Instructions to get the key are '
        '<a href="%s">here</a>.')) % 'https://isbndb.com/api/v1/docs/keys'

    def __init__(self, *args, **kwargs):
        Source.__init__(self, *args, **kwargs)

        prefs = self.prefs
        prefs.defaults['key_migrated'] = False
        prefs.defaults['isbndb_key'] = None

        if not prefs['key_migrated']:
            prefs['key_migrated'] = True
            try:
                from calibre.customize.ui import config
                key = config['plugin_customization']['IsbnDB']
                prefs['isbndb_key'] = key
            except:
                pass

    @property
    def isbndb_key(self):
        return self.prefs['isbndb_key']

    def is_configured(self):
        return self.isbndb_key is not None

    def create_query(self, title=None, authors=None, identifiers={}):  # {{{
        from urllib import quote
        base_url = BASE_URL % self.isbndb_key
        isbn = check_isbn(identifiers.get('isbn', None))
        q = ''
        if isbn is not None:
            q = 'index1=isbn&value1=' + isbn
        elif title or authors:
            tokens = []
            title_tokens = list(self.get_title_tokens(title))
            tokens += title_tokens
            author_tokens = self.get_author_tokens(authors,
                                                   only_first_author=True)
            tokens += author_tokens
            tokens = [
                quote(t.encode('utf-8') if isinstance(t, unicode) else t)
                for t in tokens
            ]
            q = '+'.join(tokens)
            q = 'index1=combined&value1=' + q

        if not q:
            return None
        if isinstance(q, unicode):
            q = q.encode('utf-8')
        return base_url + q

    # }}}

    def identify(
            self,
            log,
            result_queue,
            abort,
            title=None,
            authors=None,  # {{{
            identifiers={},
            timeout=30):
        if not self.is_configured():
            return
        query = self.create_query(title=title,
                                  authors=authors,
                                  identifiers=identifiers)
        if not query:
            err = 'Insufficient metadata to construct query'
            log.error(err)
            return err

        results = []
        try:
            results = self.make_query(query,
                                      abort,
                                      title=title,
                                      authors=authors,
                                      identifiers=identifiers,
                                      timeout=timeout)
        except:
            err = 'Failed to make query to ISBNDb, aborting.'
            log.exception(err)
            return err

        if not results and identifiers.get('isbn', False) and title and authors and \
                not abort.is_set():
            return self.identify(log,
                                 result_queue,
                                 abort,
                                 title=title,
                                 authors=authors,
                                 timeout=timeout)

        for result in results:
            self.clean_downloaded_metadata(result)
            result_queue.put(result)

    def parse_feed(self, feed, seen, orig_title, orig_authors, identifiers):
        from lxml import etree

        def tostring(x):
            if x is None:
                return ''
            return etree.tostring(x, method='text', encoding=unicode).strip()

        orig_isbn = identifiers.get('isbn', None)
        title_tokens = list(self.get_title_tokens(orig_title))
        author_tokens = list(self.get_author_tokens(orig_authors))
        results = []

        def ismatch(title, authors):
            authors = lower(' '.join(authors))
            title = lower(title)
            match = not title_tokens
            for t in title_tokens:
                if lower(t) in title:
                    match = True
                    break
            amatch = not author_tokens
            for a in author_tokens:
                if lower(a) in authors:
                    amatch = True
                    break
            if not author_tokens:
                amatch = True
            return match and amatch

        bl = feed.find('BookList')
        if bl is None:
            err = tostring(feed.find('errormessage'))
            raise ValueError('ISBNDb query failed: ' + err)
        total_results = int(bl.get('total_results'))
        shown_results = int(bl.get('shown_results'))
        for bd in bl.xpath('.//BookData'):
            isbn = check_isbn(bd.get('isbn', None))
            isbn13 = check_isbn(bd.get('isbn13', None))
            if not isbn and not isbn13:
                continue
            if orig_isbn and orig_isbn not in {isbn, isbn13}:
                continue
            title = tostring(bd.find('Title'))
            if not title:
                continue
            authors = []
            for au in bd.xpath('.//Authors/Person'):
                au = tostring(au)
                if au:
                    if ',' in au:
                        ln, _, fn = au.partition(',')
                        au = fn.strip() + ' ' + ln.strip()
                authors.append(au)
            if not authors:
                continue
            comments = tostring(bd.find('Summary'))
            id_ = (title, tuple(authors))
            if id_ in seen:
                continue
            seen.add(id_)
            if not ismatch(title, authors):
                continue
            publisher = tostring(bd.find('PublisherText'))
            if not publisher:
                publisher = None
            if publisher and 'audio' in publisher.lower():
                continue
            mi = Metadata(title, authors)
            mi.isbn = isbn
            mi.publisher = publisher
            mi.comments = comments
            results.append(mi)
        return total_results, shown_results, results

    def make_query(self,
                   q,
                   abort,
                   title=None,
                   authors=None,
                   identifiers={},
                   max_pages=10,
                   timeout=30):
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.utils.cleantext import clean_ascii_chars

        page_num = 1
        parser = etree.XMLParser(recover=True, no_network=True)
        br = self.browser

        seen = set()

        candidates = []
        total_found = 0
        while page_num <= max_pages and not abort.is_set():
            url = q.replace('&page_number=1&', '&page_number=%d&' % page_num)
            page_num += 1
            raw = br.open_novisit(url, timeout=timeout).read()
            feed = etree.fromstring(xml_to_unicode(
                clean_ascii_chars(raw), strip_encoding_pats=True)[0],
                                    parser=parser)
            total, found, results = self.parse_feed(feed, seen, title, authors,
                                                    identifiers)
            total_found += found
            candidates += results
            if total_found >= total or len(candidates) > 9:
                break

        return candidates
Exemplo n.º 10
0
class Ozon(Source):
    name = 'OZON.ru'
    description = _('Downloads metadata and covers from OZON.ru')

    capabilities = frozenset(['identify', 'cover'])

    touched_fields = frozenset([
        'title', 'authors', 'identifier:isbn', 'identifier:ozon', 'publisher',
        'pubdate', 'comments', 'series', 'rating', 'languages'
    ])
    # For test purposes only; the test function does not like it when some fields are sometimes empty
    # touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
    #                          'publisher', 'pubdate', 'comments'])

    supports_gzip_transfer_encoding = True
    has_html_comments = True

    ozon_url = 'http://www.ozon.ru'

    # match any ISBN10/13. From "Regular Expressions Cookbook"
    isbnPattern = r'(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|'\
             '[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?'\
             '(?:[0-9]+[- ]?){2}[0-9X]'
    isbnRegex = re.compile(isbnPattern)
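    # For illustration (the numbers are hypothetical): the pattern is meant to
    # match both plain and hyphenated forms, e.g. '5699120147', '5-699-12014-6'
    # or '978-5-699-12014-7', with or without a leading 'ISBN:'-style prefix.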

    optkey_strictmatch = 'strict_result_match'
    options = (Option(
        optkey_strictmatch, 'bool', False,
        _('Filter out less relevant hits from the search results'),
        _('Improve search results by removing less relevant hits. This can be useful to refine the search when there are many matches'
          )), )

    def get_book_url(self, identifiers):  # {{{
        import urllib2
        ozon_id = identifiers.get('ozon', None)
        res = None
        if ozon_id:
            url = '{}/context/detail/id/{}?partner={}'.format(
                self.ozon_url, urllib2.quote(ozon_id), _get_affiliateId())
            res = ('ozon', ozon_id, url)
        return res

    # }}}

    def create_query(self,
                     log,
                     title=None,
                     authors=None,
                     identifiers={}):  # {{{
        from urllib import quote_plus

        # div_book -> search only books, ebooks and audio books
        search_url = self.ozon_url + '/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText='

        # for ozon.ru search we have to format ISBN with '-'
        isbn = _format_isbn(log, identifiers.get('isbn', None))
        if isbn and '-' not in isbn:
            log.error(
                "%s requires a formatted ISBN for search. %s cannot be formatted and was removed (only the Russian ISBN format is currently supported)"
                % (self.name, isbn))
            isbn = None

        ozonid = identifiers.get('ozon', None)

        qItems = set([ozonid, isbn])

        unk = unicode(_('Unknown')).upper()

        if title and title != unk:
            qItems.add(title)
        if authors and authors != [unk]:
            qItems |= frozenset(authors)

        qItems.discard(None)
        qItems.discard('')
        qItems = map(_quoteString, qItems)
        searchText = u' '.join(qItems).strip()
        if isinstance(searchText, unicode):
            searchText = searchText.encode('utf-8')
        if not searchText:
            return None

        search_url += quote_plus(searchText)
        log.debug(u'search url: %r' % search_url)
        return search_url

    # }}}
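
    # Illustrative sketch (title/author values are hypothetical): for a title and
    # author such as 'Мастер и Маргарита' / 'Булгаков' the returned URL has the form
    #     http://www.ozon.ru/webservice/webservice.asmx/SearchWebService?searchContext=div_book&searchText=<encoded terms>
    # where each term is passed through _quoteString() and the joined string
    # through quote_plus().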

    def identify(self,
                 log,
                 result_queue,
                 abort,
                 title=None,
                 authors=None,
                 identifiers={},
                 timeout=60):  # {{{
        from lxml import etree
        from calibre.ebooks.chardet import xml_to_unicode

        if not self.is_configured():
            return
        query = self.create_query(log,
                                  title=title,
                                  authors=authors,
                                  identifiers=identifiers)
        if not query:
            err = u'Insufficient metadata to construct query'
            log.error(err)
            return err

        try:
            raw = self.browser.open_novisit(query).read()

        except Exception as e:
            log.exception(u'Failed to make identify query: %r' % query)
            return as_unicode(e)

        try:
            parser = etree.XMLParser(recover=True, no_network=True)
            feed = etree.fromstring(xml_to_unicode(raw,
                                                   strip_encoding_pats=True,
                                                   assume_utf8=True)[0],
                                    parser=parser)
            entries = feed.xpath(
                '//*[local-name()="SearchItems" or local-name()="ItemDetail"]')
            if entries:
                metadata = self.get_metadata(log, entries, title, authors,
                                             identifiers)
                self.get_all_details(log, metadata, abort, result_queue,
                                     identifiers, timeout)
        except Exception as e:
            log.exception('Failed to parse identify results')
            return as_unicode(e)

    # }}}

    def get_metadata(self, log, entries, title, authors, identifiers):  # {{{
        # some book titles contain extra punctuation characters like the ones below
        # TODO: make this a configurable tweak
        #reRemoveFromTitle = None
        reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')

        title = unicode(title).upper() if title else ''
        if reRemoveFromTitle:
            title = reRemoveFromTitle.sub('', title)
        authors = map(_normalizeAuthorNameWithInitials,
                      map(unicode.upper, map(unicode,
                                             authors))) if authors else None
        ozon_id = identifiers.get('ozon', None)
        #log.debug(u'ozonid: ', ozon_id)

        unk = unicode(_('Unknown')).upper()

        if title == unk:
            title = None

        if authors == [unk] or authors == []:
            authors = None

        def in_authors(authors, miauthors):
            for author in authors:
                for miauthor in miauthors:
                    #log.debug(u'=> %s <> %s'%(author, miauthor))
                    if author in miauthor:
                        return True
            return None

        def calc_source_relevance(mi):  # {{{
            relevance = 0
            if title:
                mititle = unicode(mi.title).upper() if mi.title else ''
                if reRemoveFromTitle:
                    mititle = reRemoveFromTitle.sub('', mititle)
                if title in mititle:
                    relevance += 3
                elif mititle:
                    # log.debug(u'!!%s!'%mititle)
                    relevance -= 3
            else:
                relevance += 1

            if authors:
                miauthors = map(unicode.upper, map(
                    unicode, mi.authors)) if mi.authors else []
                if (in_authors(authors, miauthors)):
                    relevance += 3
                elif u''.join(miauthors):
                    # log.debug(u'!%s!'%u'|'.join(miauthors))
                    relevance -= 3
            else:
                relevance += 1

            if ozon_id:
                mozon_id = mi.identifiers['ozon']
                if ozon_id == mozon_id:
                    relevance += 100

            if relevance < 0:
                relevance = 0
            return relevance

        # }}}

        strict_match = self.prefs[self.optkey_strictmatch]
        metadata = []
        for entry in entries:
            mi = self.to_metadata(log, entry)
            relevance = calc_source_relevance(mi)
            # TODO: find out which one is really used
            mi.source_relevance = relevance
            mi.relevance_in_source = relevance

            if not strict_match or relevance > 0:
                metadata.append(mi)
                #log.debug(u'added metadata %s %s.'%(mi.title,  mi.authors))
            else:
                log.debug(
                    u'skipped metadata title: %s, authors: %s. (does not match the query - relevance score: %s)'
                    % (mi.title, u' '.join(mi.authors), relevance))
        return metadata

    # }}}

    def get_all_details(self, log, metadata, abort, result_queue, identifiers,
                        timeout):  # {{{
        req_isbn = identifiers.get('isbn', None)

        for mi in metadata:
            if abort.is_set():
                break
            try:
                ozon_id = mi.identifiers['ozon']

                try:
                    self.get_book_details(log, mi, timeout)
                except:
                    log.exception(u'Failed to get details for metadata: %s' %
                                  mi.title)

                all_isbns = getattr(mi, 'all_isbns', [])
                if req_isbn and all_isbns and check_isbn(
                        req_isbn) not in all_isbns:
                    log.debug(u'skipped, no requested ISBN %s found' %
                              req_isbn)
                    continue

                for isbn in all_isbns:
                    self.cache_isbn_to_identifier(isbn, ozon_id)

                if mi.ozon_cover_url:
                    self.cache_identifier_to_cover_url(ozon_id,
                                                       mi.ozon_cover_url)

                self.clean_downloaded_metadata(mi)
                result_queue.put(mi)
            except:
                log.exception(u'Failed to get details for metadata: %s' %
                              mi.title)

    # }}}

    def to_metadata(self, log, entry):  # {{{
        xp_template = 'normalize-space(./*[local-name() = "{0}"]/text())'

        title = entry.xpath(xp_template.format('Name'))
        author = entry.xpath(xp_template.format('Author'))
        norm_authors = map(_normalizeAuthorNameWithInitials,
                           map(unicode.strip,
                               unicode(author).split(u',')))
        mi = Metadata(title, norm_authors)

        ozon_id = entry.xpath(xp_template.format('ID'))
        mi.identifiers = {'ozon': ozon_id}

        mi.comments = entry.xpath(xp_template.format('Annotation'))

        mi.ozon_cover_url = None
        cover = entry.xpath(xp_template.format('Picture'))
        if cover:
            mi.ozon_cover_url = _translateToBigCoverUrl(cover)

        pub_year = entry.xpath(xp_template.format('Year'))
        if pub_year:
            mi.pubdate = toPubdate(log, pub_year)
            #log.debug('pubdate %s'%mi.pubdate)

        rating = entry.xpath(xp_template.format('ClientRatingValue'))
        if rating:
            try:
                # 'rating' is a floating point number between 0 and 10.
                # OZON rates out of 5 and calibre out of 10, but there may be a bug in identify.
                mi.rating = float(rating)
            except:
                pass
        return mi

    # }}}

    def get_cached_cover_url(self, identifiers):  # {{{
        url = None
        ozon_id = identifiers.get('ozon', None)
        if ozon_id is None:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                ozon_id = self.cached_isbn_to_identifier(isbn)
        if ozon_id is not None:
            url = self.cached_identifier_to_cover_url(ozon_id)
        return url

    # }}}

    def download_cover(self,
                       log,
                       result_queue,
                       abort,
                       title=None,
                       authors=None,
                       identifiers={},
                       timeout=30,
                       get_best_cover=False):  # {{{
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.debug('No cached cover found, running identify')
            rq = Queue()
            self.identify(log,
                          rq,
                          abort,
                          title=title,
                          authors=authors,
                          identifiers=identifiers)
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers))
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break

        if cached_url is None:
            log.info('No cover found')
            return

        if abort.is_set():
            return

        log.debug('Downloading cover from:', cached_url)
        try:
            cdata = self.browser.open_novisit(cached_url,
                                              timeout=timeout).read()
            if cdata:
                result_queue.put((self, cdata))
        except Exception as e:
            log.exception(u'Failed to download cover from: %s' % cached_url)
            return as_unicode(e)

    # }}}

    def get_book_details(self, log, metadata, timeout):  # {{{
        from lxml import html, etree
        from calibre.ebooks.chardet import xml_to_unicode

        url = self.get_book_url(metadata.get_identifiers())[2]

        raw = self.browser.open_novisit(url, timeout=timeout).read()
        doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])

        xpt_tmpl_base = u'//text()[starts-with(translate(normalize-space(.), " \t", ""), "%s")]'
        xpt_tmpl_a = u'normalize-space(' + xpt_tmpl_base + u'/following-sibling::a[1]/@title)'

        # series Серия/Серии
        series = doc.xpath(xpt_tmpl_a % u'Сери')
        if series:
            metadata.series = series
        #log.debug(u'Seria: ', metadata.series)

        xpt_isbn = u'normalize-space(' + xpt_tmpl_base + u')'
        isbn_str = doc.xpath(xpt_isbn % u'ISBN')
        if isbn_str:
            #log.debug(u'ISBNS: ', self.isbnRegex.findall(isbn_str))
            all_isbns = [
                check_isbn(isbn) for isbn in self.isbnRegex.findall(isbn_str)
                if _verifyISBNIntegrity(log, isbn)
            ]
            if all_isbns:
                metadata.all_isbns = all_isbns
                metadata.isbn = all_isbns[0]
        #log.debug(u'ISBN: ', metadata.isbn)

        publishers = doc.xpath(xpt_tmpl_a % u'Издатель')
        if publishers:
            metadata.publisher = publishers
        #log.debug(u'Publisher: ', metadata.publisher)

        xpt_lang = u'substring-after(normalize-space(//text()[contains(normalize-space(.), "%s")]), ":")'
        displ_lang = None
        langs = doc.xpath(xpt_lang % u'Язык')
        if langs:
            lng_splt = langs.split(u',')
            if lng_splt:
                displ_lang = lng_splt[0].strip()
        metadata.language = _translageLanguageToCode(displ_lang)
        #log.debug(u'Language: ', metadata.language)

        # may already have been set from the XML search response
        if not metadata.pubdate:
            xpt = u'substring-after(' + xpt_isbn + u',";")'
            yearIn = doc.xpath(xpt % u'ISBN')
            if yearIn:
                matcher = re.search(r'\d{4}', yearIn)
                if matcher:
                    metadata.pubdate = toPubdate(log, matcher.group(0))
        #log.debug(u'Pubdate: ', metadata.pubdate)

        # overwrite comments from HTML if any
        xpt = u'//*[@id="detail_description"]//*[contains(text(), "От производителя")]/../node()[not(self::comment())][not(self::br)][preceding::*[contains(text(), "От производителя")]]'  # noqa
        from lxml.etree import ElementBase
        comment_elem = doc.xpath(xpt)
        if comment_elem:
            comments = u''
            for node in comment_elem:
                if isinstance(node, ElementBase):
                    comments += unicode(etree.tostring(node, encoding=unicode))
                elif isinstance(node, basestring) and node.strip():
                    comments += unicode(node) + u'\n'
            if comments and (not metadata.comments
                             or len(comments) > len(metadata.comments)):
                metadata.comments = comments
            else:
                log.debug(
                    'HTML book description skipped in favor of search service xml response'
                )
        else:
            log.debug('No book description found in HTML')
Exemplo n.º 11
0
class Douban(Source):

    name = "Douban Books Reload"
    author = "Li Fanxi, xcffl, jnozsc"
    version = (4, 0, 0)
    minimum_calibre_version = (2, 80, 0)

    description = _(
        "Downloads metadata and covers from Douban.com. "
        "Useful only for Chinese language books."
    )

    capabilities = frozenset(["identify", "cover"])
    touched_fields = frozenset(
        [
            "title",
            "authors",
            "tags",
            "pubdate",
            "comments",
            "publisher",
            "identifier:isbn",
            "rating",
            "identifier:douban",
        ]
    )  # language currently disabled
    supports_gzip_transfer_encoding = True
    cached_cover_url_is_reliable = True

    DOUBAN_API_URL = "https://api.douban.com/v2/book/search"
    DOUBAN_BOOK_URL = "https://book.douban.com/subject/%s/"

    options = (
        Option(
            "include_subtitle_in_title",
            "bool",
            True,
            _("Include subtitle in book title:"),
            _("Whether to append subtitle in the book title."),
        ),
        Option(
            "apikey", "string", "", _("douban api v2 apikey"), _("douban api v2 apikey")
        ),
    )

    def to_metadata(self, browser, log, entry_, timeout):  # {{{
        from calibre.utils.date import parse_date, utcnow

        douban_id = entry_.get("id")
        title = entry_.get("title")
        description = entry_.get("summary")
        # subtitle = entry_.get('subtitle')  # TODO: std metadata doesn't have this field
        publisher = entry_.get("publisher")
        isbn = entry_.get("isbn13")  # ISBN11 is obsolute, use ISBN13
        pubdate = entry_.get("pubdate")
        authors = entry_.get("author")
        book_tags = entry_.get("tags")
        rating = entry_.get("rating")
        cover_url = entry_.get("images", {}).get("large")
        series = entry_.get("series")

        if not authors:
            authors = [_("Unknown")]
        if not douban_id or not title:
            # Silently discard this entry
            return None

        mi = Metadata(title, authors)
        mi.identifiers = {"douban": douban_id}
        mi.publisher = publisher
        mi.comments = description
        # mi.subtitle = subtitle

        # ISBN
        isbns = []
        if isinstance(isbn, (type(""), bytes)):
            if check_isbn(isbn):
                isbns.append(isbn)
        else:
            for x in isbn:
                if check_isbn(x):
                    isbns.append(x)
        if isbns:
            mi.isbn = sorted(isbns, key=len)[-1]
        mi.all_isbns = isbns

        # Tags
        mi.tags = [tag["name"] for tag in book_tags]

        # pubdate
        if pubdate:
            try:
                default = utcnow().replace(day=15)
                mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
            except:
                log.error("Failed to parse pubdate %r" % pubdate)

        # Ratings
        if rating:
            try:
                mi.rating = float(rating["average"]) / 2.0
            except:
                log.exception("Failed to parse rating")
                mi.rating = 0

        # Cover
        mi.has_douban_cover = None
        u = cover_url
        if u:
            # If URL contains "book-default", the book doesn't have a cover
            if u.find("book-default") == -1:
                mi.has_douban_cover = u

        # Series
        if series:
            mi.series = series["title"]

        return mi

    # }}}
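
    # Minimal sketch of the JSON entry shape to_metadata() expects (only the keys
    # the method reads above; all values are hypothetical):
    #     {"id": "1084336", "title": "...", "summary": "...", "publisher": "...",
    #      "isbn13": "9787020042494", "pubdate": "2003-8", "author": ["..."],
    #      "tags": [{"name": "..."}], "rating": {"average": "9.0"},
    #      "images": {"large": "https://..."}, "series": {"title": "..."}}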

    def get_book_url(self, identifiers):  # {{{
        db = identifiers.get("douban", None)
        if db is not None:
            return ("douban", db, self.DOUBAN_BOOK_URL % db)

    # }}}

    def create_query(self, log, title=None, authors=None, identifiers={}):  # {{{
        try:
            from urllib.parse import urlencode
        except ImportError:
            from urllib import urlencode
        SEARCH_URL = "https://api.douban.com/v2/book/search?count=10&"
        ISBN_URL = "https://api.douban.com/v2/book/isbn/"
        SUBJECT_URL = "https://api.douban.com/v2/book/"

        q = ""
        t = None
        isbn = check_isbn(identifiers.get("isbn", None))
        subject = identifiers.get("douban", None)
        if isbn is not None:
            q = isbn
            t = "isbn"
        elif subject is not None:
            q = subject
            t = "subject"
        elif title or authors:

            def build_term(prefix, parts):
                return " ".join(x for x in parts)

            title_tokens = list(self.get_title_tokens(title))
            if title_tokens:
                q += build_term("title", title_tokens)
            author_tokens = list(
                self.get_author_tokens(authors, only_first_author=True)
            )
            if author_tokens:
                q += (" " if q != "" else "") + build_term("author", author_tokens)
            t = "search"
        q = q.strip()
        # if isinstance(q, type("")):
        #    q = q.encode("utf-8")
        q = str(q)
        if not q:
            return None
        url = None
        if t == "isbn":
            url = ISBN_URL + q
        elif t == "subject":
            url = SUBJECT_URL + q
        else:
            url = SEARCH_URL + urlencode(
                {
                    "q": q,
                }
            )
        if self.prefs.get("apikey"):
            if t == "isbn" or t == "subject":
                url = url + "?apikey=" + self.prefs["apikey"]
            else:
                url = url + "&apikey=" + self.prefs["apikey"]
        return url

    # }}}
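
    # Illustrative sketch of the URLs create_query() builds (the identifiers and
    # the apikey value are hypothetical):
    #   ISBN lookup:    https://api.douban.com/v2/book/isbn/9787020042494?apikey=KEY
    #   Douban subject: https://api.douban.com/v2/book/1084336?apikey=KEY
    #   Title/author:   https://api.douban.com/v2/book/search?count=10&q=...&apikey=KEY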

    def download_cover(
        self,
        log,
        result_queue,
        abort,  # {{{
        title=None,
        authors=None,
        identifiers={},
        timeout=30,
        get_best_cover=False,
    ):
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.info("No cached cover found, running identify")
            rq = Queue()
            self.identify(
                log, rq, abort, title=title, authors=authors, identifiers=identifiers
            )
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(
                key=self.identify_results_keygen(
                    title=title, authors=authors, identifiers=identifiers
                )
            )
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break
        if cached_url is None:
            log.info("No cover found")
            return

        if abort.is_set():
            return
        br = self.browser
        log("Downloading cover from:", cached_url)
        try:
            cdata = br.open_novisit(cached_url, timeout=timeout).read()
            if cdata:
                result_queue.put((self, cdata))
        except:
            log.exception("Failed to download cover from:", cached_url)

    # }}}

    def get_cached_cover_url(self, identifiers):  # {{{
        url = None
        db = identifiers.get("douban", None)
        if db is None:
            isbn = identifiers.get("isbn", None)
            if isbn is not None:
                db = self.cached_isbn_to_identifier(isbn)
        if db is not None:
            url = self.cached_identifier_to_cover_url(db)

        return url

    # }}}

    def get_all_details(self, br, log, entries, abort, result_queue, timeout):  # {{{
        for relevance, i in enumerate(entries):
            try:
                ans = self.to_metadata(br, log, i, timeout)
                if isinstance(ans, Metadata):
                    ans.source_relevance = relevance
                    db = ans.identifiers["douban"]
                    for isbn in getattr(ans, "all_isbns", []):
                        self.cache_isbn_to_identifier(isbn, db)
                    if ans.has_douban_cover:
                        self.cache_identifier_to_cover_url(db, ans.has_douban_cover)
                    self.clean_downloaded_metadata(ans)
                    result_queue.put(ans)
            except:
                log.exception("Failed to get metadata for identify entry:", i)
            if abort.is_set():
                break

    # }}}

    def identify(
        self,
        log,
        result_queue,
        abort,
        title=None,
        authors=None,  # {{{
        identifiers={},
        timeout=30,
    ):

        # check apikey
        if not self.prefs.get("apikey"):
            return

        import json

        query = self.create_query(
            log, title=title, authors=authors, identifiers=identifiers
        )
        if not query:
            log.error("Insufficient metadata to construct query")
            return
        br = self.browser
        try:
            raw = br.open_novisit(query, timeout=timeout).read()
        except Exception as e:
            log.exception("Failed to make identify query: %r" % query)
            return as_unicode(e)
        try:
            j = json.loads(raw)
        except Exception as e:
            log.exception("Failed to parse identify results")
            return as_unicode(e)
        if "books" in j:
            entries = j["books"]
        else:
            entries = []
            entries.append(j)
        if not entries and identifiers and title and authors and not abort.is_set():
            return self.identify(
                log, result_queue, abort, title=title, authors=authors, timeout=timeout
            )
        # There is no point running these queries in threads as douban
        # throttles requests returning 403 Forbidden errors
        self.get_all_details(br, log, entries, abort, result_queue, timeout)

        return None
Exemplo n.º 12
0
class GoodreadsAPI(Source):
    """
    Goodreads API
    """
    name = 'GoodreadsAPI'
    description = 'GoodreadsAPI'
    author = 'botmtl'
    version = (0, 0, 2)
    minimum_calibre_version = (0, 8, 1)
    capabilities = frozenset(['identify'])
    has_html_comments = True
    supports_gzip_transfer_encoding = True
    BASE_URL = 'https://www.goodreads.com'
    ISBN_TO_BOOKID = 'https://www.goodreads.com/book/isbn_to_id/{0}?key={1}'
    BOOK_SHOW = 'https://www.goodreads.com/book/show/{0}.xml?key={1}'
    BOOK_SHOW_ISBN = 'https://www.goodreads.com/book/isbn/{0}.xml?key={1}'
    # name, type_, default, label, desc, choices=None
    options = [
        Option(name='GOODREADS_API_KEY',
               type_='string',
               default='',
               label='GOODREADS_API_KEY',
               desc='GOODREADS_API_KEY'),
        Option(
            name='SHELF_COUNT_THRESHOLD',
            type_='number',
            default=2,
            label='SHELF_COUNT_THRESHOLD:',
            desc=
            'How many times a book must be shelved under a Goodreads shelf for that shelf to be considered a tag.'
        ),
        Option(name='NEVER_REPLACE_AMAZONID',
               type_='bool',
               default=True,
               label='NEVER_REPLACE_AMAZONID:',
               desc='NEVER_REPLACE_AMAZONID'),
        Option(name='NEVER_REPLACE_ISBN',
               type_='bool',
               default=True,
               label='NEVER_REPLACE_ISBN:',
               desc='NEVER_REPLACE_ISBN'),
        Option(name='CHECK_AMAZONID_VALIDITY',
               type_='bool',
               default=True,
               label='CHECK_AMAZONID_VALIDITY:',
               desc='Not Implemented.'),
        Option(
            name='ADD_THESE_TAGS',
            type_='string',
            default='GoodreadsAPI',
            label='Additional tags:',
            desc=
            'A comma-separated list of tags to add on a successful metadata download.'
        ),
        Option(
            u'DISABLE_TITLE_AUTHOR_SEARCH', u'bool', False,
            u'Disable title/author search:',
            u'Only books with identifiers will have a chance to find a match with the metadata provider.'
        )
    ]

    def __init__(self, *args, **kwargs):
        """
        Args:
            args:
            kwargs:
        """
        self.touched_fields = frozenset([
            'title', 'authors', 'identifier:goodreads', 'identifier:amazon',
            'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate',
            'tags', 'series'
        ])
        Source.__init__(self, *args, **kwargs)

    def is_configured(self):
        # type: () -> bool
        """
        :return: False if your plugin needs to be configured before it can be used. For example, it might need a username/password/API key.
        :rtype: bool
        """
        if self.prefs['GOODREADS_API_KEY']:
            return True

        return False

    def get_cached_cover_url(self, identifiers):
        """
        :param identifiers: dict: identifier type mapped to identifier value
        :return: Text: url
        """
        url = None
        if identifiers.get('goodreads'):
            url = self.cached_identifier_to_cover_url(
                identifiers.get('goodreads'))

        return url

    def clean_downloaded_metadata(self, mi):
        """
        Overridden from the calibre default so that we can stop this plugin messing
        with the tag casing coming from Goodreads
        """
        series_in_title = r'\s*{0}\s*#?{1}\s*'.format(mi.series,
                                                      mi.series_index)
        if mi.title:
            mi.title = re.sub(series_in_title + r'[:-]',
                              r'',
                              mi.title,
                              flags=re.IGNORECASE).strip()
            mi.title = re.sub(r'(?:[^:-]+)[:-]' + series_in_title,
                              r'',
                              mi.title,
                              flags=re.IGNORECASE).strip()
            mi.title = re.sub(r'\(.*?\)', r'', mi.title,
                              flags=re.IGNORECASE).strip()
            mi.title = re.sub(r'\[.*?\]', r'', mi.title,
                              flags=re.IGNORECASE).strip()
            mi.title = fixcase(mi.title)
            mi.title = mi.title.strip()

        if mi.authors:
            mi.authors = fixauthors(mi.authors)
            try:
                plugin_prefs = JSONConfig('plugins/Quality Check')
                from calibre_plugins.quality_check.config import STORE_OPTIONS, KEY_AUTHOR_INITIALS_MODE, AUTHOR_INITIALS_MODES
                initials_mode = plugin_prefs[STORE_OPTIONS].get(
                    KEY_AUTHOR_INITIALS_MODE, u'A. B.')
                from calibre_plugins.quality_check.helpers import get_formatted_author_initials
                mi.authors = [
                    get_formatted_author_initials(initials_mode, author)
                    for author in mi.authors
                ]
            except:
                pass
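
    # For illustration (the title is hypothetical, and assumes mi.series does not
    # also appear in it): a downloaded title such as
    # 'Dune Messiah (Dune, #2) [Kindle Edition]' is reduced to 'Dune Messiah' by
    # the parenthesis/bracket stripping above before fixcase() is applied.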

    def _autocomplete_api(self, search_terms, timeout=10):
        # type: (Text, int) -> Text or None
        """
        :param search_terms: unicode: search term(s)
        :param timeout: int: timeout in seconds after which urlopen raises an exception
        :return: the Goodreads book id of the first result, or None if nothing was found
        """
        from urllib2 import urlopen
        import json
        if search_terms is None:
            return None
        search_terms = search_terms.strip()
        search_terms = search_terms.replace(' and ',
                                            ' ').replace(' or ', ' ').replace(
                                                ' & ', ' ').replace('-', ' ')
        search_terms = search_terms.replace('  ', ' ')
        search_terms = search_terms.strip().replace(' ', '+')
        autocomplete_api_url = "https://www.goodreads.com/book/auto_complete?format=json&q="
        self.log.info('autocomplete url:', autocomplete_api_url, search_terms)
        response = urlopen(autocomplete_api_url + search_terms,
                           timeout=timeout).read()
        if response is not None:
            result = json.loads(response)
            if len(result) >= 1:
                return result[0]['bookId']
        return None
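
    # Usage sketch (the search string and returned id are hypothetical):
    #     goodreads_id = self._autocomplete_api(u'dune frank herbert', timeout=10)
    #     # -> the Goodreads book id of the first autocomplete hit, e.g. 234225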

    def identify(self,
                 log,
                 result_queue,
                 abort,
                 title=None,
                 authors=None,
                 identifiers=None,
                 timeout=30):
        """

        :param log:
        :param result_queue:
        :param abort:
        :param title:
        :param authors:
        :param identifiers:
        :param timeout:
        :return:
        """
        if not identifiers: identifiers = {}
        goodreads_id = None
        # noinspection PyAttributeOutsideInit
        self.log = log
        if identifiers.get('amazon'):
            try:
                self.log.info('ISBN_TO_BOOKID', identifiers.get('amazon'))
                request = GoodreadsAPI.ISBN_TO_BOOKID.format(
                    identifiers.get('amazon'), self.prefs['GOODREADS_API_KEY'])
                goodreads_id = urlopen(request).read()
            except:
                pass
        if not goodreads_id and identifiers.get('goodreads'):
            goodreads_id = identifiers.get('goodreads')
        if not goodreads_id and identifiers.get('isbn'):
            try:
                self.log.info('ISBN_TO_BOOKID', identifiers.get('isbn'))
                request = GoodreadsAPI.ISBN_TO_BOOKID.format(
                    identifiers.get('isbn'), self.prefs['GOODREADS_API_KEY'])
                goodreads_id = urlopen(request).read()
            except:
                pass

        if not goodreads_id and title and not self.prefs[
                'DISABLE_TITLE_AUTHOR_SEARCH']:
            self.log.info(
                'AUTOCOMPLETEAPI:', ' '.join(self.get_title_tokens(title)) +
                ' ' + ' '.join(self.get_author_tokens(authors)))
            goodreads_id = self._autocomplete_api(
                ' '.join(self.get_title_tokens(title)) + ' ' +
                ' '.join(self.get_author_tokens(authors)), 10)

        if goodreads_id:
            try:
                self.log.info('BOOK_SHOW ', goodreads_id)
                request_book = GoodreadsAPI.BOOK_SHOW.format(
                    goodreads_id, self.prefs['GOODREADS_API_KEY'])
                response = urlopen(request_book).read()
                response = re.sub(re.compile(r'>\s+<', re.MULTILINE), '><',
                                  response)
                response = re.sub(re.compile(r'\r\n', re.MULTILINE), r'',
                                  response)
                mi = self._GoodreadsBook_to_Metadata(
                    _GoodreadsBook(str(response),
                                   self.prefs['SHELF_COUNT_THRESHOLD']))
            except Exception as e:
                self.log.error(e.message)
                self.log.error(traceback.print_stack())
                traceback.print_exc()
                return

            self.clean_downloaded_metadata(mi)
            result_queue.put(mi)

        return None

    def _GoodreadsBook_to_Metadata(self, book):
        # type: (_GoodreadsBook) -> Metadata
        """
        :param book: _GoodreadsBook: book
        :return: Metadata: Metadata
        """
        mi = Metadata(book.title, book.authors)
        mi.source_relevance = 0
        mi.set_identifier('goodreads', book.id)

        if self.prefs['NEVER_REPLACE_ISBN'] and mi.get_identifiers().get(
                'isbn'):
            mi.set_identifier('isbn', '')

        if book.asin and not self.prefs['NEVER_REPLACE_AMAZONID']:
            mi.set_identifier('amazon', book.asin)

        if book.isbn and not self.prefs['NEVER_REPLACE_ISBN']:
            try:
                if len(book.isbn) == 10:
                    mi.isbn = check_isbn13(_ISBNConvert.convert(book.isbn))
                else:
                    mi.isbn = check_isbn13(book.isbn)
            except:
                self.log.error("ISBN CONVERSION ERROR:", book.isbn)
                self.log.exception()

        if book.image_url:
            self.log.info('cache_identifier_to_cover_url:', book.id, ':',
                          book.image_url)
            self.cache_identifier_to_cover_url(book.id, book.image_url)

        if book.publisher:
            self.log.info('book.publisher is:', book.publisher)
            mi.publisher = book.publisher

        if book.pubdate:
            self.log.info('book.pubdate is:',
                          book.pubdate.strftime('%Y-%m-%d'))
            mi.pubdate = book.pubdate

        if book.comments:
            self.log.info('book.editorial_review is:', book.comments)
            mi.comments = book.comments

        tags = self.prefs['ADD_THESE_TAGS'].split(',')
        tags.extend(book.tags)
        # tag_mappings = JSONConfig('plugins/GenreMappings')['genreMappings']
        # mi.tags = list(set(sorted(filter(lambda x: tag_mappings.get(x, x), tags))))

        if book.series:
            mi.series = book.series
            self.log.info(u'series:', book.series)
            if book.series_index:
                mi.series_index = book.series_index
                self.log.info(u'series_index:',
                              "{0:.2f}".format(book.series_index))
            else:
                mi.series_index = 0

        if book.average_rating:
            mi.rating = book.average_rating

        self.clean_downloaded_metadata(mi)

        return mi

    def cli_main(self, args):
        """
        :type args: list
        :param args: args
        """
        pass

    # noinspection PyDefaultArgument
    def download_cover(self,
                       log,
                       result_queue,
                       abort,
                       title=None,
                       authors=[],
                       identifiers={},
                       timeout=30,
                       get_best_cover=False):
        # type: (ThreadSafeLog, Queue, Event, Text, list, dict, int, bool) -> Text
        """
        Download a cover and put it into result_queue. The parameters all have
        the same meaning as for :meth:`identify`. Put (self, cover_data) into
        result_queue.

        This method should use cached cover URLs for efficiency whenever
        possible. When cached data is not present, most plugins simply call
        identify and use its results.

        If the parameter get_best_cover is True and this plugin can get
        multiple covers, it should only get the best one.
        :param log: ThreadSafeLog: log
        :param result_queue: Queue: result queue
        :param abort: Event: abort if set
        :param title: Optional[unicode]: title
        :param authors: Optional[List]: authors
        :param identifiers: Optional[Dict]: identifiers
        :param timeout: int: timeout
        :param get_best_cover: bool: get only the best cover
        :return:
        """
        # noinspection PyAttributeOutsideInit
        self.log = log
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            self.log.info(u'No cached cover found, running identify')
            try:
                rq = Queue()
                self.identify(self.log, rq, abort, title, authors, identifiers)
                cached_url = self.get_cached_cover_url(identifiers)
                if cached_url is None:
                    return u'Download cover failed.  Could not identify.'
            except Exception as e:
                return e.message

        if abort.is_set():
            return "abort"

        br = self.browser
        self.log.info(u'Downloading cover from:', cached_url)
        try:
            cdata = br.open_novisit(cached_url, timeout=timeout).read()
            result_queue.put((self, cdata))
        except:
            self.log.error(u'Failed to download cover from:', cached_url)
            return u'Failed to download cover from:%s' % cached_url  # }}}
Exemplo n.º 13
0
class MySource(Source):
    options = [
        Option(
            'clean_title', 'bool', True, _('Clean title'),
            _('Enable this option to clean the title metadata and convert it to "Title Case".'
              ))
    ]

    # Plugin Options
    has_html_comments = True
    supports_gzip_transfer_encoding = False

    # My Options
    idkey = None
    maxresults = 5
    sleep_time = 0.5
    worker_class = None
    abstract_title = None

    def identify(self,
                 log,
                 result_queue,
                 abort,
                 title=None,
                 authors=None,
                 identifiers={},
                 timeout=30):

        md = self.worker_class(self.browser, timeout)

        d = {}
        idval = identifiers.get(self.idkey, None)
        isbn = identifiers.get('isbn', None)

        if idval: d['id'] = idval
        if isbn: d['isbn'] = isbn
        if title: d['title'] = title
        if authors: d['authors'] = authors

        md.query(d, maxresults=self.maxresults)

        while not abort.is_set():
            md.join(0.2)
            if abort.is_set(): break
            if not md.is_alive(): break

        time.sleep(self.sleep_time)

        if not abort.is_set():
            for i in range(0, len(md.ans)):
                mi = self.data2mi(md.ans[i])
                mi.source_relevance = i  # Less means more relevant.
                mi.isbn = check_isbn(mi.isbn)

                result_queue.put(mi)
        return None

    def identify_results_keygen(self,
                                title=None,
                                authors=None,
                                identifiers={}):
        """ Returns a key to sort search results. Lesser value means more relevance."""

        query = dict([('title', title), ('authors', authors)] +
                     identifiers.items())

        def mi_distance(mi):
            mifields = dict([('title', mi.title), ('authors', mi.authors)] +
                            mi.identifiers.items())
            return metadata_distance(query, mifields, idkey=self.idkey)

        return mi_distance

    def data2mi(self, item):
        """Converts a single metadata answer in the form of a dict to a MetadataInformation object"""

        mi = Metadata(_('Unknown'))

        # Regular metadata
        mi.title = item.get('title', None)
        mi.authors = item.get('authors', [])
        mi.publisher = item.get('publisher', None)

        if 'id' in item.keys(): mi.set_identifier(self.idkey, item['id'])
        if 'doi' in item.keys(): mi.set_identifier('doi', item['doi'])
        if 'isbn' in item.keys(): mi.set_identifier('isbn', item['isbn'])

        if 'updated' in item.keys():
            mi.pubdate = parse_date(item['updated'], assume_utc=True)

        if 'series' in item.keys():
            mi.series = item['series']
            mi.series_index = self.format_series_index(
                item.get('series_index'), None)

        if 'year' in item.keys():
            mi.pubdate = parse_date(item['year'], assume_utc=True)

        if 'abstract' in item.keys():
            mi.comments = self.format_abstract(item['abstract'])

        if 'language' in item.keys(): mi.language = item['language']

        if 'journal' in item.keys():
            mi.series = item['journal']
            mi.series_index = self.format_series_index(item.get('volume'),
                                                       item.get('number'))

        if 'subject' in item.keys():
            tags = set([])
            for s in item['subject']:
                tags.update(msc_tags(s))
                tags.update(arxiv_tags(s))

            mi.tags = list(sorted(tags))

        return mi

    def format_abstract(self, abstract):
        return '<h3>%s</h3>\n %s' % (self.abstract_title, abstract)

    def format_paragraph(self, par):
        par = escape(par)
        par = re.sub(r"{\\it(.*?)}", "<i>\g<1></i>", par)
        par = re.sub("\s+", ' ', par)
        return '<p>%s</p>' % par
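
    # For illustration (the input string is hypothetical):
    #     format_paragraph(u'A {\\it small} example')
    #     # -> u'<p>A <i> small</i> example</p>'
    # (the captured group keeps the space that follows \it).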

    def surname(self, au):
        return author_to_author_sort(au).split(',')[0]

    def format_series_index(self, volume, number):
        """Formats a series index of the form 4.03 indicating number 3 in volume 4."""
        v = 0.0
        n = 0.0

        if volume:
            try:
                v = float(volume)
            except ValueError:
                v = 0.0

        if number:
            try:
                n = float(number)
            except ValueError:
                n = 0.0

        if volume and number: return v + n / 100.
        elif volume: return v
        elif number: return n
        else: return 0.
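
    # Worked example: format_series_index('4', '3') -> 4.03 (number 3 in volume 4);
    # format_series_index('4', None) -> 4.0; format_series_index(None, '3') -> 3.0.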
Exemplo n.º 14
0
class Antikvarium_hu(Source):
    name = 'Antikvarium_hu'
    description = _('Downloads metadata and cover from antikvarium.hu')
    author = 'Hoffer Csaba & Kloon & otapi'
    version = (2, 0, 3)
    minimum_calibre_version = (0, 8, 0)

    capabilities = frozenset(['identify', 'cover'])
    touched_fields = frozenset([
        'title', 'authors', 'identifier:isbn', 'identifier:antik_hu', 'tags',
        'comments', 'publisher', 'pubdate', 'series', 'language', 'languages'
    ])
    has_html_comments = False
    supports_gzip_transfer_encoding = False

    KEY_MAX_DOWNLOADS = 'maxDownloads'

    options = [
        Option(
            KEY_MAX_DOWNLOADS, 'number', 3,
            _('Maximum number of books to get'),
            _('The maximum number of books to process from the Antikvarium search result'
              )),
    ]

    BASE_URL = 'https://www.antikvarium.hu'
    BOOK_URL = BASE_URL + '/konyv/'

    def create_query(self, log, title=None, authors=None, identifiers={}):
        if title is not None:
            search_title = urllib.quote(title.encode('utf-8'))
        else:
            search_title = ''
        log.info(' Title: %s' % search_title)

        if authors is not None:
            search_author = urllib.quote(authors[0].encode('utf-8'))
        else:
            search_author = ''
        log.info(' Author: %s' % search_author)

        search_page = "https://www.antikvarium.hu/index.php?type=search&kc=%s&sz=%s&he=0&jk=0&reszletes=1&rend=kiadasevecsokk&oldaldb=60&kapelol=0&nezet=li&elist=egyebadat&interfaceid=102&oldalcount=1" % (
            search_title, search_author)
        return search_page
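
    # Illustrative sketch (title/author values are hypothetical): for title
    # 'Egri csillagok' and author 'Gárdonyi Géza' the returned search page URL
    # contains 'kc=Egri%20csillagok&sz=G%C3%A1rdonyi%20G%C3%A9za' among the
    # fixed search parameters.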

    def get_cached_cover_url(self, identifiers):
        url = None
        antik_id = identifiers.get('antik_hu', None)
        if antik_id is None:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                antik_id = self.cached_isbn_to_identifier(isbn)
        if antik_id is not None:
            url = self.cached_identifier_to_cover_url(antik_id)
        return url

    def cached_identifier_to_cover_url(self, id_):
        with self.cache_lock:
            url = self._get_cached_identifier_to_cover_url(id_)
            if not url:
                # Try for a "small" image in the cache
                url = self._get_cached_identifier_to_cover_url('small/' + id_)
            return url

    def _get_cached_identifier_to_cover_url(self, id_):
        # This must only be called once we have the cache lock
        url = self._identifier_to_cover_url_cache.get(id_, None)
        if not url:
            # We could not get a url for this particular antikvarium id.
            # However, we might have one cached for a different ISBN of this
            # book: the site is not always consistent with its covers, and the
            # particular ISBN we chose may not have a large image while
            # another ISBN we retrieved does.
            key_prefix = id_.rpartition('/')[0]
            for key in self._identifier_to_cover_url_cache.keys():
                if key.startswith(key_prefix):
                    return self._identifier_to_cover_url_cache[key]
        return url

    def identify(self,
                 log,
                 result_queue,
                 abort,
                 title,
                 authors,
                 identifiers={},
                 timeout=30):
        '''
        Note this method will retry without identifiers automatically if no
        match is found with identifiers.
        '''

        matches = []
        antik_id = identifiers.get('antik_hu', None)
        isbn = check_isbn(identifiers.get('isbn', None))
        br = browser()
        log.info(u'\nTitle:%s\nAuthors:%s\n' % (title, authors))
        if antik_id:
            matches.append('%s%s' % (Antikvarium_hu.BOOK_URL, antik_id))
        else:
            if isbn:
                matches.append(
                    'https://www.antikvarium.hu/index.php?type=search&isbn=%s'
                    % (isbn))
            else:
                query = self.create_query(log,
                                          title=title,
                                          authors=authors,
                                          identifiers=identifiers)
                if query is None:
                    log.error('Insufficient metadata to construct query')
                    return
                try:
                    log.info('Querying: %s' % query)
                    response = br.open(query)
                except Exception as e:
                    if isbn and callable(getattr(e, 'getcode',
                                                 None)) and e.getcode() == 404:
                        # We did a lookup by ISBN but did not find a match
                        # We will fallback to doing a lookup by title author
                        log.info('Failed to find match for ISBN: %s' % isbn)
                    else:
                        err = 'Failed to make identify query: %r' % query
                        log.exception(err)
                        return as_unicode(e)

                try:
                    raw = response.read().strip()
                    raw = raw.decode('utf-8', errors='replace')
                    if not raw:
                        log.error('Failed to get raw result for query: %r' %
                                  query)
                        return
                    root = fromstring(clean_ascii_chars(raw))
                except:
                    msg = 'Failed to parse Antikvarium.hu page for query: %r' % query
                    log.exception(msg)
                    return msg
                self._parse_search_results(log, title, authors, root, matches,
                                           timeout)

        if abort.is_set():
            return
        if not matches:
            if identifiers and title and authors:
                log.info(
                    'No matches found with identifiers, retrying using only'
                    ' title and authors')
                return self.identify(log,
                                     result_queue,
                                     abort,
                                     title=title,
                                     authors=authors,
                                     timeout=timeout)
            log.error('No matches found with query: %r' % query)
            return
        from calibre_plugins.antikvarium_hu.worker import Worker
        workers = [
            Worker(url, result_queue, br, log, i, self)
            for i, url in enumerate(matches)
        ]

        for w in workers:
            w.start()
            # Don't send all requests at the same time
            time.sleep(0.1)

        while not abort.is_set():
            a_worker_is_alive = False
            for w in workers:
                w.join(0.2)

                if abort.is_set():
                    break
                if w.is_alive():
                    a_worker_is_alive = True
            if not a_worker_is_alive:
                break
        return None

    def _parse_search_results(self, log, title, authors, root, matches,
                              timeout):
        results = root.xpath('//*[@class="book-data-holder-list"]')

        max_results = self.prefs[Antikvarium_hu.KEY_MAX_DOWNLOADS]
        for result in results:
            urls = result.xpath('//*[@id="searchResultKonyvCim-listas"]/@href')
            book_url = 'https://www.antikvarium.hu/' + urls[0]
            log.info('Book URL: %r' % book_url)

            titlenode = result.xpath(
                './/*[@id="searchResultKonyvCim-listas"]/span')[0]
            n_title = '%s' % titlenode.text_content()
            log.info('Book title: %s' % n_title)

            authorenode = result.xpath(
                './/*[@id="searchResultKonyvSzerzo-listas"]')[0]
            etree.strip_tags(authorenode, 'span')
            n_author = '%s' % authorenode.text_content()
            log.info('Book author: %s' % n_author)

            if title:
                if title.lower() not in n_title.lower() and self.strip_accents(
                        title) not in self.strip_accents(n_title):
                    continue

            if authors:
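                # Hungarian listings often put the family name first, so also
                # try the first author with its first two name parts swapped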
                author1 = authors[0]
                authorsplit = author1.split(" ")
                author2 = author1
                if len(authorsplit) > 1:
                    author2 = '%s %s' % (authorsplit[1], authorsplit[0])
                log.info('author1: %s' % author1)
                log.info('n_author: %s' % n_author)
                log.info('author2: %s' % author2)
                if author1.lower() not in n_author.lower() \
                and self.strip_accents(author1) not in self.strip_accents(n_author) \
                and author2.lower() not in n_author.lower() \
                and self.strip_accents(author2) not in self.strip_accents(n_author):
                    continue

            matches.append(book_url)
            if len(matches) >= max_results:
                return

    def strip_accents(self, s):
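        # map Hungarian accented vowels to their unaccented ASCII
        # counterparts and lower-case the result for comparisons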
        symbols = (u"öÖüÜóÓőŐúÚéÉáÁűŰíÍ", u"oOuUoOoOuUeEaAuUiI")

        tr = dict([(ord(a), ord(b)) for (a, b) in zip(*symbols)])

        return s.translate(tr).lower()

    def download_cover(self,
                       log,
                       result_queue,
                       abort,
                       title=None,
                       authors=None,
                       identifiers={},
                       timeout=30):
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.info('No cached cover found, running identify')
            rq = Queue()
            self.identify(log,
                          rq,
                          abort,
                          title=title,
                          authors=authors,
                          identifiers=identifiers)
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers))
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break
        if cached_url is None:
            log.info('No cover found')
            return

        if abort.is_set():
            return
        br = self.browser
        log.info('Downloading cover from:', cached_url)
        try:
            cdata = br.open_novisit(cached_url, timeout=timeout).read()
            result_queue.put((self, cdata))
        except:
            log.exception('Failed to download cover from:', cached_url)
Exemplo n.º 15
class Ozon(Source):
    name = 'OZON.ru'
    description = _('Downloads metadata and covers from OZON.ru (updated)')

    capabilities = frozenset(['identify', 'cover'])

    touched_fields = frozenset([
        'title', 'authors', 'identifier:isbn', 'identifier:ozon', 'publisher',
        'pubdate', 'comments', 'series', 'rating', 'languages'
    ])
    # For testing only: the test function does not like it when some fields are sometimes empty
    # touched_fields = frozenset(['title', 'authors', 'identifier:isbn', 'identifier:ozon',
    #                          'publisher', 'pubdate', 'comments'])

    supports_gzip_transfer_encoding = True
    has_html_comments = True

    ozon_url = 'http://www.ozon.ru'

    # match any ISBN10/13. From "Regular Expressions Cookbook"
    isbnPattern = r'(?:ISBN(?:-1[03])?:? )?(?=[-0-9 ]{17}|' \
                  '[-0-9X ]{13}|[0-9X]{10})(?:97[89][- ]?)?[0-9]{1,5}[- ]?' \
                  '(?:[0-9]+[- ]?){2}[0-9X]'
    isbnRegex = re.compile(isbnPattern)
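    # Illustration (not from the original source): the pattern matches both
    # hyphenated and plain forms, e.g. isbnRegex.search('ISBN: 978-5-699-12014-7')
    # finds a match (the number itself is a made-up example).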

    optkey_strictmatch = 'strict_result_match'
    options = (Option(
        optkey_strictmatch, 'bool', False,
        _('Filter out less relevant hits from the search results'),
        _('Improve search result by removing less relevant hits. It can be useful to refine the search when there are many matches'
          )), )

    def get_book_url(self, identifiers):  # {{{
        import urllib2
        ozon_id = identifiers.get('ozon', None)
        res = None
        if ozon_id:
            # no affiliateId is used in search/detail
            url = '{}/context/detail/id/{}'.format(self.ozon_url,
                                                   urllib2.quote(ozon_id))
            res = ('ozon', ozon_id, url)
        return res

    # }}}

    def create_query(self,
                     log,
                     title=None,
                     authors=None,
                     identifiers={}):  # {{{
        from urllib import quote_plus

        # div_book -> search only books, ebooks and audio books
        search_url = self.ozon_url + '/?context=search&group=div_book&text='

        # for ozon.ru search we have to format ISBN with '-'
        isbn = _format_isbn(log, identifiers.get('isbn', None))
        if isbn and '-' not in isbn:
            log.error(
                "%s requires a formatted ISBN for search. %s cannot be formatted and was dropped (only the Russian ISBN format is currently supported)"
                % (self.name, isbn))
            isbn = None

        ozonid = identifiers.get('ozon', None)

        qItems = set([ozonid, isbn])

        # Added Russian variant of 'Unknown'
        unk = [_('Unknown').upper(), 'Неизв.'.upper(), icu_upper('Неизв.')]

        if title and title not in unk:
            qItems.add(title)

        if authors:
            for auth in authors:
                if icu_upper(auth) not in unk:
                    qItems.add(auth)

        qItems.discard(None)
        qItems.discard('')
        searchText = u' '.join(qItems).strip()

        if isinstance(searchText, unicode):
            searchText = searchText.encode('utf-8')
        if not searchText:
            return None

        search_url += quote_plus(searchText)
        log.debug(u'search url: %r' % search_url)
        return search_url

    # }}}

    def identify(self,
                 log,
                 result_queue,
                 abort,
                 title=None,
                 authors=None,
                 identifiers={},
                 timeout=90):  # {{{
        from calibre.ebooks.chardet import xml_to_unicode
        from HTMLParser import HTMLParser
        from lxml import etree, html

        if not self.is_configured():
            return
        query = self.create_query(log,
                                  title=title,
                                  authors=authors,
                                  identifiers=identifiers)
        if not query:
            err = u'Insufficient metadata to construct query'
            log.error(err)
            return err

        try:
            raw = self.browser.open_novisit(query).read()
        except Exception as e:
            log.exception(u'Failed to make identify query: %r' % query)
            return as_unicode(e)

        try:
            doc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
            entries_block = doc.xpath(u'//div[@class="bSearchResult"]')

            if entries_block:
                entries = doc.xpath(
                    u'//div[contains(@itemprop, "itemListElement")]')
                # for entry in entries:
                #   log.debug('entries %s' % entree.tostring(entry))
                metadata = self.get_metadata(log, entries, title, authors,
                                             identifiers)
                self.get_all_details(log, metadata, abort, result_queue,
                                     identifiers, timeout)
            else:
                # Redirect page: trying to extract ozon_id from javascript data
                h = HTMLParser()
                entry_string = (h.unescape(
                    unicode(etree.tostring(doc, pretty_print=True))))
                id_title_pat = re.compile(
                    u'products":\[{"id":(\d{7}),"name":"([а-яА-Я :\-0-9]+)')
                # result containing ozon_id and entry_title
                entry_info = re.search(id_title_pat, entry_string)
                ozon_id = entry_info.group(1) if entry_info else None
                entry_title = entry_info.group(2) if entry_info else None

                if ozon_id:
                    metadata = self.to_metadata_for_single_entry(
                        log, ozon_id, entry_title, authors)
                    identifiers['ozon'] = ozon_id
                    self.get_all_details(log, [metadata],
                                         abort,
                                         result_queue,
                                         identifiers,
                                         timeout,
                                         cachedPagesDict={})
                else:
                    log.error('No SearchResults in Ozon.ru response found')

        except Exception as e:
            log.exception('Failed to parse identify results')
            return as_unicode(e)

    # }}}

    def to_metadata_for_single_entry(self, log, ozon_id, title,
                                     authors):  # {{{

        # parsing javascript data from the redirect page
        mi = Metadata(title, authors)
        mi.identifiers = {'ozon': ozon_id}

        return mi

    # }}}

    def get_metadata(self, log, entries, title, authors, identifiers):  # {{{
        # some book titles contain extra punctuation characters like these;
        # strip them before comparing
        reRemoveFromTitle = re.compile(r'[?!:.,;+-/&%"\'=]')

        title = unicode(title).upper() if title else ''
        if reRemoveFromTitle:
            title = reRemoveFromTitle.sub('', title)
        authors = map(_normalizeAuthorNameWithInitials,
                      map(unicode.upper, map(unicode,
                                             authors))) if authors else None

        ozon_id = identifiers.get('ozon', None)
        # log.debug(u'ozonid: ', ozon_id)

        unk = unicode(_('Unknown')).upper()

        if title == unk:
            title = None

        if authors == [unk] or authors == []:
            authors = None

        def in_authors(authors, miauthors):
            for author in authors:
                for miauthor in miauthors:
                    # log.debug(u'=> %s <> %s'%(author, miauthor))
                    if author in miauthor:
                        return True
            return None

        def calc_source_relevance(mi):  # {{{
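            # scoring: title match +3 / mismatch -3, author match +3 /
            # mismatch -3, matching ozon id +100; a missing title or missing
            # authors in the query adds +1; the result is clamped at 0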
            relevance = 0
            if title:
                mititle = unicode(mi.title).upper() if mi.title else ''

                if reRemoveFromTitle:
                    mititle = reRemoveFromTitle.sub('', mititle)

                if title in mititle:
                    relevance += 3
                elif mititle:
                    # log.debug(u'!!%s!'%mititle)
                    relevance -= 3
            else:
                relevance += 1

            if authors:
                miauthors = map(unicode.upper, map(
                    unicode, mi.authors)) if mi.authors else []
                # log.debug('Authors %s vs miauthors %s'%(','.join(authors), ','.join(miauthors)))

                if (in_authors(authors, miauthors)):
                    relevance += 3
                elif u''.join(miauthors):
                    # log.debug(u'!%s!'%u'|'.join(miauthors))
                    relevance -= 3
            else:
                relevance += 1

            if ozon_id:
                mozon_id = mi.identifiers['ozon']
                if ozon_id == mozon_id:
                    relevance += 100

            if relevance < 0:
                relevance = 0
            return relevance

        # }}}

        strict_match = self.prefs[self.optkey_strictmatch]
        metadata = []
        for entry in entries:

            mi = self.to_metadata(log, entry)
            relevance = calc_source_relevance(mi)
            # TODO: find out which of the two attributes is actually used
            mi.source_relevance = relevance
            mi.relevance_in_source = relevance

            if not strict_match or relevance > 0:
                # getting rid of a random book that shows up in results
                if not (mi.title == 'Unknown'):
                    metadata.append(mi)
                    # log.debug(u'added metadata %s %s.'%(mi.title,  mi.authors))
            else:
                log.debug(
                    u'skipped metadata title: %s, authors: %s. (does not match the query - relevance score: %s)'
                    % (mi.title, u' '.join(mi.authors), relevance))
        return metadata

    # }}}

    def get_all_details(self,
                        log,
                        metadata,
                        abort,
                        result_queue,
                        identifiers,
                        timeout,
                        cachedPagesDict={}):  # {{{

        req_isbn = identifiers.get('isbn', None)

        for mi in metadata:
            if abort.is_set():
                break
            try:
                ozon_id = mi.identifiers['ozon']

                try:
                    self.get_book_details(
                        log, mi, timeout,
                        cachedPagesDict[ozon_id] if cachedPagesDict
                        and ozon_id in cachedPagesDict else None)
                except:
                    log.exception(u'Failed to get details for metadata: %s' %
                                  mi.title)

                all_isbns = getattr(mi, 'all_isbns', [])
                if req_isbn and all_isbns and check_isbn(
                        req_isbn) not in all_isbns:
                    log.debug(u'skipped, no requested ISBN %s found' %
                              req_isbn)
                    continue

                for isbn in all_isbns:
                    self.cache_isbn_to_identifier(isbn, ozon_id)

                if mi.ozon_cover_url:
                    self.cache_identifier_to_cover_url(ozon_id,
                                                       mi.ozon_cover_url)

                self.clean_downloaded_metadata(mi)
                result_queue.put(mi)

            except:
                log.exception(u'Failed to get details for metadata: %s' %
                              mi.title)

    # }}}

    def to_metadata(self, log, entry):  # {{{
        title = unicode(
            entry.xpath(
                u'normalize-space(.//span[@itemprop="name"][1]/text())'))
        # log.debug(u'Title: -----> %s' % title)

        author = unicode(
            entry.xpath(u'normalize-space(.//a[contains(@href, "person")])'))
        # log.debug(u'Author: -----> %s' % author)

        norm_authors = map(_normalizeAuthorNameWithInitials,
                           map(unicode.strip,
                               unicode(author).split(u',')))
        mi = Metadata(title, norm_authors)

        ozon_id = entry.get('data-href').split('/')[-2]

        if ozon_id:
            mi.identifiers = {'ozon': ozon_id}
            # log.debug(u'ozon_id: -----> %s' % ozon_id)

        mi.ozon_cover_url = None
        cover = entry.xpath(u'normalize-space(.//img[1]/@src)')
        log.debug(u'cover: -----> %s' % cover)
        if cover:
            mi.ozon_cover_url = _translateToBigCoverUrl(cover)
            # log.debug(u'mi.ozon_cover_url: -----> %s' % mi.ozon_cover_url)

        pub_year = None
        pub_year_block = entry.xpath(
            u'.//div[@class="bOneTileProperty"]/text()')
        year_pattern = re.compile(r'\d{4}')
        if pub_year_block:
            pub_year = re.search(year_pattern, pub_year_block[0])
            if pub_year:
                mi.pubdate = toPubdate(log, pub_year.group())
        # log.debug('pubdate %s' % mi.pubdate)

        mi.rating = self.get_rating(log, entry)
        # if not mi.rating:
        #    log.debug('No rating found. ozon_id:%s'%ozon_id)

        return mi

    # }}}

    def get_rating(self, log, entry):  # {{{
        # log.debug(entry)
        ozon_rating = None
        try:
            xp_rating_template = u'boolean(.//div[contains(@class, "bStars") and contains(@class, "%s")])'
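            # the rating is encoded as a CSS class 'm1'..'m5' on the element
            # that also carries the 'bStars' class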
            rating = None
            if entry.xpath(xp_rating_template % 'm5'):
                rating = 5.
            elif entry.xpath(xp_rating_template % 'm4'):
                rating = 4.
            elif entry.xpath(xp_rating_template % 'm3'):
                rating = 3.
            elif entry.xpath(xp_rating_template % 'm2'):
                rating = 2.
            elif entry.xpath(xp_rating_template % 'm1'):
                rating = 1.
            if rating:
                # 'rating' is a floating point number between 0 and 10.
                # OZON rates out of 5, calibre out of 10, but there seems to
                # be a bug in identify, so the raw 5-point value is kept
                ozon_rating = float(rating)
        except:
            pass
        return ozon_rating

    # }}}

    def get_cached_cover_url(self, identifiers):  # {{{
        url = None
        ozon_id = identifiers.get('ozon', None)
        if ozon_id is None:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                ozon_id = self.cached_isbn_to_identifier(isbn)
        if ozon_id is not None:
            url = self.cached_identifier_to_cover_url(ozon_id)
        return url

    # }}}

    def download_cover(self,
                       log,
                       result_queue,
                       abort,
                       title=None,
                       authors=None,
                       identifiers={},
                       timeout=30,
                       get_best_cover=False):  # {{{

        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.debug('No cached cover found, running identify')
            rq = Queue()
            self.identify(log,
                          rq,
                          abort,
                          title=title,
                          authors=authors,
                          identifiers=identifiers)
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers))
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break

        if cached_url is None:
            log.info('No cover found')
            return

        if abort.is_set():
            return

        log.debug('Downloading cover from:', cached_url)
        try:
            cdata = self.browser.open_novisit(cached_url,
                                              timeout=timeout).read()
            if cdata:
                result_queue.put((self, cdata))
        except Exception as e:
            log.exception(u'Failed to download cover from: %s' % cached_url)
            return as_unicode(e)

    # }}}

    def get_book_details(self, log, metadata, timeout, cachedPage):  # {{{
        from lxml import etree, html
        from calibre.ebooks.chardet import xml_to_unicode

        if not cachedPage:
            url = self.get_book_url(metadata.get_identifiers())[2]
            # log.debug(u'book_details_url', url)

            raw = self.browser.open_novisit(url, timeout=timeout).read()
            fulldoc = html.fromstring(xml_to_unicode(raw, verbose=True)[0])
        else:
            fulldoc = cachedPage
            log.debug(u'book_details -> using cached page')

        fullString = etree.tostring(fulldoc)
        doc = fulldoc.xpath(u'//div[@class="bDetailPage"][1]')[0]

        # series Серия/Серии
        series_elem = doc.xpath(u'//div[contains(text(), "Сери")]')
        if series_elem:
            series_text_elem = series_elem[0].getnext()
            metadata.series = series_text_elem.xpath(u'.//a/text()')[0]
            log.debug(u'**Seria: ', metadata.series)

        isbn = None
        isbn_elem = doc.xpath(u'//div[contains(text(), "ISBN")]')
        if isbn_elem:
            isbn = isbn_elem[0].getnext().xpath(u'normalize-space(./text())')
            metadata.identifiers['isbn'] = isbn

        # get authors/editors if no authors are available
        authors_joined = ','.join(metadata.authors)

        if authors_joined == '' or authors_joined == "Unknown":
            authors_from_detail = []
            editor_elem = doc.xpath(u'//div[contains(text(), "Редактор")]')
            if editor_elem:
                editor = editor_elem[0].getnext().xpath(u'.//a/text()')[0]
                authors_from_detail.append(editor + u' (ред.)')
            authors_elem = doc.xpath(u'//div[contains(text(), "Автор")]')
            if authors_elem:
                authors = authors_elem[0].getnext().xpath(
                    u'.//a/text()')  # list
                authors_from_detail.extend(authors)
            if len(authors_from_detail) > 0:
                metadata.authors = authors_from_detail

        cover = doc.xpath('.//img[contains(@class, "fullImage")]/@src')[0]
        metadata.ozon_cover_url = _translateToBigCoverUrl(cover)

        publishers = None
        publishers_elem = doc.xpath(u'//div[contains(text(), "Издатель")]')
        if publishers_elem:
            publishers_elem = publishers_elem[0].getnext()
            publishers = publishers_elem.xpath(u'.//a/text()')[0]

        if publishers:
            metadata.publisher = publishers

        displ_lang = None
        langs = None
        langs_elem = doc.xpath(u'//div[contains(text(), "зык")]')
        if langs_elem:
            langs_elem = langs_elem[0].getnext()
            langs = langs_elem.xpath(u'text()')[0].strip()
        if langs:
            lng_splt = langs.split(u',')
            if lng_splt:
                displ_lang = lng_splt[0].strip()
                # log.debug(u'displ_lang1: ', displ_lang)
        metadata.language = _translageLanguageToCode(displ_lang)
        # log.debug(u'Language: ', metadata.language)

        # can be set before from xml search response
        if not metadata.pubdate:
            pubdate_elem = doc.xpath(u'//div[contains(text(), "Год выпуска")]')
            if pubdate_elem:
                pubYear = pubdate_elem[0].getnext().xpath(u'text()')[0].strip()
                if pubYear:
                    matcher = re.search(r'\d{4}', pubYear)
                    if matcher:
                        metadata.pubdate = toPubdate(log, matcher.group(0))
        # log.debug(u'Pubdate: ', metadata.pubdate)

        # comments, from Javascript data
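        # the description is embedded in a JS object after the 'FirstBlock'
        # marker; the hard-coded offsets below slice out the escaped string
        # value before it is unescaped and its HTML entities are replaced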
        beginning = fullString.find(u'FirstBlock')
        end = fullString.find(u'}', beginning)
        comments = unicode(fullString[beginning + 75:end -
                                      1]).decode("unicode-escape")
        metadata.comments = replace_entities(comments, 'utf-8')
Exemplo n.º 16
class Ehentai(Source):

    name = 'E-hentai Galleries'
    author = 'Wu yuan, cssxsh'
    version = (1, 1, 3)
    minimum_calibre_version = (2, 80, 0)

    description = _('Download metadata and cover from e-hentai.org. '
                    'Useful only for doujinshi.')

    capabilities = frozenset(['identify', 'cover'])
    touched_fields = frozenset([
        'title', 'authors', 'tags', 'rating', 'publisher', 'identifier:ehentai'
    ])
    supports_gzip_transfer_encoding = True
    cached_cover_url_is_reliable = True

    EHentai_url = 'https://e-hentai.org/g/%s/%s/'
    ExHentai_url = 'https://exhentai.org/g/%s/%s/'

    options = (
        Option(
            'Use_Exhentai', 'bool', False, _('Use Exhentai'),
            _('If Use Exhentai is True, the plugin will search for metadata on exhentai.org.'
              )),
        Option('ipb_member_id', 'string', None, _('ipb_member_id'),
               _('If Use Exhentai is True, please input your cookies.')),
        Option('ipb_pass_hash', 'string', None, _('ipb_pass_hash'),
               _('If Use Exhentai is True, please input your cookies.')),
        Option('igneous', 'string', None, _('igneous'),
               _('If Use Exhentai is True, please input your cookies.')),
        Option(
            'Use_Proxy', 'bool', False, _('Use Proxy'),
            _('If Use Proxy is True, the plugin will search for metadata through the proxy.'
              )),
        Option(
            'link',
            'string',
            None,
            _('link'),  # username:[email protected]:8888
            _('If Use Proxy is True, please enter your proxy, for example: username:[email protected]:8888'
              )),
    )

    config_help_message = ('<p>' + _(
        'To download metadata from exhentai.org you must sign up for'
        ' a free account and obtain the cookies for .exhentai.org.'
        ' If you don\'t have an account, you can <a href="%s">sign up</a>.')
                           ) % 'https://forums.e-hentai.org/index.php'

    def __init__(self, *args, **kwargs):  # {{{
        Source.__init__(self, *args, **kwargs)
        self.config_exhentai()
        self.config_proxy()

    # }}}

    def config_exhentai(self):  # {{{

        ExHentai_Status = self.prefs['Use_Exhentai']
        ExHentai_Cookies = [
            {
                'name': 'ipb_member_id',
                'value': self.prefs['ipb_member_id'],
                'domain': '.exhentai.org',
                'path': '/'
            },
            {
                'name': 'ipb_pass_hash',
                'value': self.prefs['ipb_pass_hash'],
                'domain': '.exhentai.org',
                'path': '/'
            },
            {
                'name': 'igneous',
                'value': self.prefs['igneous'],
                'domain': '.exhentai.org',
                'path': '/'
            },
        ]

        if ExHentai_Status is True:
            for cookie in ExHentai_Cookies:
                if cookie['value'] is None:
                    ExHentai_Status = False
                    break

        self.ExHentai_Status = ExHentai_Status
        self.ExHentai_Cookies = ExHentai_Cookies
        return

    # }}}

    def config_proxy(self):  # {{{

        Proxy_Status = self.prefs['Use_Proxy']
        Proxy = {'https': self.prefs['link'], 'http': self.prefs['link']}
        self.Proxy_Status = Proxy_Status
        self.Proxy = Proxy

    # }}}

    def create_query(self,
                     log,
                     title=None,
                     authors=None,
                     identifiers={},
                     is_exhentai=False):  # {{{

        EHentai_SEARCH_URL = 'https://e-hentai.org/?'
        ExHentai_SEARCH_URL = 'https://exhentai.org/?'

        q = ''

        if title or authors:

            def build_term(type, parts):
                return ' '.join(parts)

            title_token = list(self.get_title_tokens(title))
            if title_token:
                q = q + build_term('title', title_token)
            author_token = list(
                self.get_author_tokens(authors, only_first_author=True))
            if author_token:
                q = q + (' ' if q != '' else '') + build_term(
                    'author', author_token)
        q = q.strip()
        if isinstance(q, unicode):
            q = q.encode('utf-8')
        if not q:
            return None
        q_dict = {
            'f_doujinshi': 1,
            'f_manga': 1,
            'f_artistcg': 1,
            'f_gamecg': 1,
            'f_western': 1,
            'f_non-h': 1,
            'f_imageset': 1,
            'f_cosplay': 1,
            'f_asianporn': 1,
            'f_misc': 1,
            'f_search': q,
            'f_apply': 'Apply+Filter',
            'advsearch': 1,
            'f_sname': 'on',
            'f_sh': 'on',
            'f_srdd': 2
        }
        if is_exhentai is False:
            url = EHentai_SEARCH_URL + urlencode(q_dict)
        else:
            url = ExHentai_SEARCH_URL + urlencode(q_dict)
        return url

    # }}}

    def get_gallery_info(self, log, raw):  # {{{

        pattern = re.compile(
            r'https:\/\/(?:e-hentai\.org|exhentai\.org)\/g\/(?P<gallery_id>\d+)/(?P<gallery_token>\w+)/'
        )
        results = re.findall(pattern, raw)
        if not results:
            log.exception('Failed to get gallery_id and gallery_token!')
            return None
        gidlist = []
        for r in results:
            gidlist.append(list(r))
        return gidlist

    # }}}

    def get_all_details(self, gidlist, log, abort, result_queue,
                        timeout):  # {{{

        EHentai_API_url = 'https://api.e-hentai.org/api.php'
        ExHentai_API_url = 'https://exhentai.org/api.php'

        is_exhentai = self.ExHentai_Status
        use_proxy = self.Proxy_Status
        proxy = self.Proxy
        url = EHentai_API_url
        br = self.browser
        if is_exhentai is True:
            url = ExHentai_API_url
        if use_proxy is True:

            def proxy_bypass(hostname):
                log(hostname + ' by proxy')
                return True

            br.set_proxies(proxy, proxy_bypass)
        data = {"method": "gdata", "gidlist": gidlist, "namespace": 1}
        data = json.dumps(data)
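        # the resulting request body looks roughly like (ids are made up):
        # {"method": "gdata", "gidlist": [["618395", "0439fa3666"]], "namespace": 1}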
        try:
            _raw = br.open_novisit(url, timeout=timeout)
            raw = _raw.read()
        except Exception as e:
            log.exception('Failed to make api request.', e)
            return
        gmetadatas = json.loads(raw)['gmetadata']
        for relevance, gmetadata in enumerate(gmetadatas):
            try:
                ans = to_metadata(log, gmetadata, self.ExHentai_Status)
                if isinstance(ans, Metadata):
                    ans.source_relevance = relevance
                    db = ans.identifiers['ehentai']
                    if ans.has_ehentai_cover:
                        self.cache_identifier_to_cover_url(
                            db, ans.has_ehentai_cover)
                    self.clean_downloaded_metadata(ans)
                    result_queue.put(ans)
            except:
                log.exception('Failed to get metadata for identify entry:',
                              gmetadata)
            if abort.is_set():
                break

    # }}}

    def get_book_url(self, identifiers):  # {{{

        db = identifiers.get('ehentai', None)
        d = {'0': False, '1': True}
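        # the 'ehentai' identifier has the form '<gid>_<token>_<flag>', where
        # the trailing flag selects e-hentai ('0') or exhentai ('1')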
        if db is not None:
            gid, token, s = re.split('_', db)
            ExHentai_Status = d[str(s)]
            if ExHentai_Status:
                url = self.ExHentai_url % (gid, token)
            else:
                url = self.EHentai_url % (gid, token)
            return ('ehentai', db, url)

    # }}}

    def download_cover(self,
                       log,
                       result_queue,
                       abort,
                       title=None,
                       authors=None,
                       identifiers={},
                       timeout=30,
                       get_best_cover=False):  # {{{

        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            return
        if abort.is_set():
            return
        br = self.browser
        log('Downloading cover from:', cached_url)
        try:
            cdata = br.open_novisit(cached_url, timeout=timeout).read()
            if cdata:
                result_queue.put((self, cdata))
        except:
            log.exception('Failed to download cover from:', cached_url)

    # }}}

    def get_cached_cover_url(self, identifiers):  # {{{

        url = None
        db = identifiers.get('ehentai', None)
        if db is not None:
            url = self.cached_identifier_to_cover_url(db)
        return url

    # }}}

    def identify(self,
                 log,
                 result_queue,
                 abort,
                 title=None,
                 authors=None,
                 identifiers={},
                 timeout=30):  # {{{

        is_exhentai = self.ExHentai_Status
        use_proxy = self.Proxy_Status
        proxy = self.Proxy
        query = self.create_query(log,
                                  title=title,
                                  authors=authors,
                                  identifiers=identifiers,
                                  is_exhentai=is_exhentai)
        if not query:
            log.error('Insufficient metadata to construct query')
            return
        br = self.browser
        if use_proxy is True:

            def proxy_bypass(hostname):
                log(hostname + ' by proxy')
                return True

            br.set_proxies(proxy, proxy_bypass)
        if is_exhentai is True:
            for cookie in self.ExHentai_Cookies:
                br.set_cookie(name=cookie['name'],
                              value=cookie['value'],
                              domain=cookie['domain'],
                              path=cookie['path'])
        try:
            _raw = br.open_novisit(query, timeout=timeout)
            raw = _raw.read()
        except Exception as e:
            log.exception('Failed to make identify query: %r' % query)
            return as_unicode(e)
        if not raw and identifiers and title and authors and not abort.is_set(
        ):
            return self.identify(log,
                                 result_queue,
                                 abort,
                                 title=title,
                                 authors=authors,
                                 timeout=timeout)
        if is_exhentai is True:
            if 'https://exhentai.org/' not in raw:
                # no exhentai gallery links in the response, most likely the
                # cookies were rejected
                log.error('The cookies for ExHentai are invalid.')
                log.error('Exhentai cookies:')
                log.error(self.ExHentai_Cookies)
                return
        gidlist = self.get_gallery_info(log, raw)
        if not gidlist:
            log.error('No result found.\n', 'query: %s' % query)
            return
        self.get_all_details(gidlist=gidlist,
                             log=log,
                             abort=abort,
                             result_queue=result_queue,
                             timeout=timeout)
Exemplo n.º 17
class OverDrive(Source):

    name = 'Overdrive'
    version = (1, 0, 1)
    minimum_calibre_version = (2, 80, 0)
    description = _(
        'Downloads metadata and covers from Overdrive\'s Content Reserve')

    capabilities = frozenset(['identify', 'cover'])
    touched_fields = frozenset([
        'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher',
        'identifier:isbn', 'series', 'series_index', 'languages',
        'identifier:overdrive'
    ])
    has_html_comments = True
    supports_gzip_transfer_encoding = False
    cached_cover_url_is_reliable = True

    options = (Option(
        'get_full_metadata', 'bool', True, _('Download all metadata (slow)'),
        _('Enable this option to gather all metadata available from Overdrive.'
          )), )

    config_help_message = '<p>' + _(
        'Additional metadata can be taken from Overdrive\'s book detail'
        ' page. This includes a limited set of tags used by libraries, comments, language,'
        ' and the e-book ISBN. Collecting this data is disabled by default due to the extra'
        ' time required. Check the download all metadata option below to'
        ' enable downloading this data.')

    def identify(
            self,
            log,
            result_queue,
            abort,
            title=None,
            authors=None,  # {{{
            identifiers={},
            timeout=30):
        ovrdrv_id = identifiers.get('overdrive', None)
        isbn = identifiers.get('isbn', None)

        br = self.browser
        ovrdrv_data = self.to_ovrdrv_data(br, log, title, authors, ovrdrv_id)
        if ovrdrv_data:
            title = ovrdrv_data[8]
            authors = ovrdrv_data[6]
            mi = Metadata(title, authors)
            self.parse_search_results(ovrdrv_data, mi)
            if ovrdrv_id is None:
                ovrdrv_id = ovrdrv_data[7]

            if self.prefs['get_full_metadata']:
                self.get_book_detail(br, ovrdrv_data[1], mi, ovrdrv_id, log)

            if isbn is not None:
                self.cache_isbn_to_identifier(isbn, ovrdrv_id)

            result_queue.put(mi)

        return None

    # }}}

    def download_cover(
            self,
            log,
            result_queue,
            abort,  # {{{
            title=None,
            authors=None,
            identifiers={},
            timeout=30,
            get_best_cover=False):
        import mechanize
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.info('No cached cover found, running identify')
            rq = Queue()
            self.identify(log,
                          rq,
                          abort,
                          title=title,
                          authors=authors,
                          identifiers=identifiers)
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers))
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break
        if cached_url is None:
            log.info('No cover found')
            return

        if abort.is_set():
            return

        ovrdrv_id = identifiers.get('overdrive', None)
        br = self.browser
        req = mechanize.Request(cached_url)
        if ovrdrv_id is not None:
            referer = self.get_base_referer(
            ) + 'ContentDetails-Cover.htm?ID=' + ovrdrv_id
            req.add_header('referer', referer)

        log('Downloading cover from:', cached_url)
        try:
            cdata = br.open_novisit(req, timeout=timeout).read()
            result_queue.put((self, cdata))
        except:
            log.exception('Failed to download cover from:', cached_url)

    # }}}

    def get_cached_cover_url(self, identifiers):  # {{{
        url = None
        ovrdrv_id = identifiers.get('overdrive', None)
        if ovrdrv_id is None:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                ovrdrv_id = self.cached_isbn_to_identifier(isbn)
        if ovrdrv_id is not None:
            url = self.cached_identifier_to_cover_url(ovrdrv_id)

        return url

    # }}}

    def get_base_referer(
            self):  # to be used for passing referrer headers to cover download
        choices = [
            'https://overdrive.chipublib.org/82DC601D-7DDE-4212-B43A-09D821935B01/10/375/en/',
            'https://emedia.clevnet.org/9D321DAD-EC0D-490D-BFD8-64AE2C96ECA8/10/241/en/',
            'https://singapore.lib.overdrive.com/F11D55BE-A917-4D63-8111-318E88B29740/10/382/en/',
            'https://ebooks.nypl.org/20E48048-A377-4520-BC43-F8729A42A424/10/257/en/',
            'https://spl.lib.overdrive.com/5875E082-4CB2-4689-9426-8509F354AFEF/10/335/en/'
        ]
        return random.choice(choices)

    def format_results(self, reserveid, od_title, subtitle, series, publisher,
                       creators, thumbimage, worldcatlink, formatid):
        fix_slashes = re.compile(r'\\/')
        thumbimage = fix_slashes.sub('/', thumbimage)
        worldcatlink = fix_slashes.sub('/', worldcatlink)
        cover_url = re.sub(r'(?P<img>(Ima?g(eType-)?))200', r'\g<img>100',
                           thumbimage)
        social_metadata_url = base_url + 'TitleInfo.aspx?ReserveID=' + reserveid + '&FormatID=' + formatid
        series_num = ''
        if not series:
            if subtitle:
                title = od_title + ': ' + subtitle
            else:
                title = od_title
        else:
            title = od_title
            m = re.search("([0-9]+$)", subtitle)
            if m:
                series_num = float(m.group(1))
        return [
            cover_url, social_metadata_url, worldcatlink, series, series_num,
            publisher, creators, reserveid, title
        ]

    def safe_query(self, br, query_url, post=''):
        '''
        The query must be initialized by loading an empty search results page.
        That page attempts to set a cookie that mechanize doesn't like, so
        copy the cookiejar to a separate instance and make a one-off request
        with the temporary cookiejar.
        '''
        import mechanize
        goodcookies = br._ua_handlers['_cookies'].cookiejar
        clean_cj = mechanize.CookieJar()
        cookies_to_copy = []
        for cookie in goodcookies:
            copied_cookie = copy.deepcopy(cookie)
            cookies_to_copy.append(copied_cookie)
        for copied_cookie in cookies_to_copy:
            clean_cj.set_cookie(copied_cookie)

        if post:
            br.open_novisit(query_url, post)
        else:
            br.open_novisit(query_url)

        br.set_cookiejar(clean_cj)

    def overdrive_search(self, br, log, q, title, author):
        import mechanize
        # re-initialize the cookiejar so that it's clean
        clean_cj = mechanize.CookieJar()
        br.set_cookiejar(clean_cj)
        q_query = q + 'default.aspx/SearchByKeyword'
        q_init_search = q + 'SearchResults.aspx'
        # get first author as string - convert this to a proper cleanup function later
        author_tokens = list(
            self.get_author_tokens(author, only_first_author=True))
        title_tokens = list(
            self.get_title_tokens(title,
                                  strip_joiners=False,
                                  strip_subtitle=True))

        xref_q = ''
        if len(author_tokens) <= 1:
            initial_q = ' '.join(title_tokens)
            xref_q = '+'.join(author_tokens)
        else:
            initial_q = ' '.join(author_tokens)
            for token in title_tokens:
                if len(xref_q) < len(token):
                    xref_q = token

        log.error('Initial query is %s' % initial_q)
        log.error('Cross reference query is %s' % xref_q)

        q_xref = q + 'SearchResults.svc/GetResults?iDisplayLength=50&sSearch=' + xref_q
        query = '{"szKeyword":"' + initial_q + '"}'

        # main query, requires specific Content Type header
        req = mechanize.Request(q_query)
        req.add_header('Content-Type', 'application/json; charset=utf-8')
        br.open_novisit(req, query)

        # initiate the search without messing up the cookiejar
        self.safe_query(br, q_init_search)

        # get the search results object
        results = False
        iterations = 0
        while results is False:
            iterations += 1
            xreq = mechanize.Request(q_xref)
            xreq.add_header('X-Requested-With', 'XMLHttpRequest')
            xreq.add_header('Referer', q_init_search)
            xreq.add_header('Accept', 'application/json, text/javascript, */*')
            raw = br.open_novisit(xreq).read()
            for m in re.finditer(
                    type('')
                (r'"iTotalDisplayRecords":(?P<displayrecords>\d+).*?"iTotalRecords":(?P<totalrecords>\d+)'
                 ), raw):
                if int(m.group('totalrecords')) == 0:
                    return ''
                elif int(m.group('displayrecords')) >= 1:
                    results = True
                elif int(m.group('totalrecords')) >= 1 and iterations < 3:
                    if xref_q.find('+') != -1:
                        xref_tokens = xref_q.split('+')
                        xref_q = xref_tokens[0]
                        for token in xref_tokens:
                            if len(xref_q) < len(token):
                                xref_q = token
                        # log.error('rewrote xref_q, new query is '+xref_q)
                else:
                    xref_q = ''
                q_xref = q + 'SearchResults.svc/GetResults?iDisplayLength=50&sSearch=' + xref_q

        return self.sort_ovrdrv_results(raw, log, title, title_tokens, author,
                                        author_tokens)

    def sort_ovrdrv_results(self,
                            raw,
                            log,
                            title=None,
                            title_tokens=None,
                            author=None,
                            author_tokens=None,
                            ovrdrv_id=None):
        close_matches = []
        raw = re.sub(r'.*?\[\[(?P<content>.*?)\]\].*', r'[[\g<content>]]', raw)
        results = json.loads(raw)
        # log.error('raw results are:'+type('')(results))
        # The search results are either from a keyword search or a
        # multi-format list from a single ID; sort through the results for
        # the closest match/format
        if results:
            for reserveid, od_title, subtitle, edition, series, publisher, format, formatid, creators, \
                    thumbimage, shortdescription, worldcatlink, excerptlink, creatorfile, sorttitle, \
                    availabletolibrary, availabletoretailer, relevancyrank, unknown1, unknown2, unknown3 in results:
                # log.error("this record's title is "+od_title+", subtitle is "+subtitle+", author[s] are "+creators+", series is "+series)
                if ovrdrv_id is not None and int(formatid) in [
                        1, 50, 410, 900
                ]:
                    # log.error('overdrive id is not None, searching based on format type priority')
                    return self.format_results(reserveid, od_title, subtitle,
                                               series, publisher, creators,
                                               thumbimage, worldcatlink,
                                               formatid)
                else:
                    if creators:
                        creators = creators.split(', ')

                    # if an exact match in a preferred format occurs
                    if ((author and creators and creators[0] == author[0]) or (not author and not creators)) and  \
                            od_title.lower() == title.lower() and int(formatid) in [1, 50, 410, 900] and thumbimage:
                        return self.format_results(reserveid, od_title,
                                                   subtitle, series, publisher,
                                                   creators, thumbimage,
                                                   worldcatlink, formatid)
                    else:
                        close_title_match = False
                        close_author_match = False
                        for token in title_tokens:
                            if od_title.lower().find(token.lower()) != -1:
                                close_title_match = True
                            else:
                                close_title_match = False
                                break
                        for author in creators:
                            for token in author_tokens:
                                if author.lower().find(token.lower()) != -1:
                                    close_author_match = True
                                else:
                                    close_author_match = False
                                    break
                            if close_author_match:
                                break
                        if close_title_match and close_author_match and int(
                                formatid) in [1, 50, 410, 900] and thumbimage:
                            if subtitle and series:
                                close_matches.insert(
                                    0,
                                    self.format_results(
                                        reserveid, od_title, subtitle, series,
                                        publisher, creators, thumbimage,
                                        worldcatlink, formatid))
                            else:
                                close_matches.append(
                                    self.format_results(
                                        reserveid, od_title, subtitle, series,
                                        publisher, creators, thumbimage,
                                        worldcatlink, formatid))

                        elif close_title_match and close_author_match and int(
                                formatid) in [1, 50, 410, 900]:
                            close_matches.append(
                                self.format_results(reserveid, od_title,
                                                    subtitle, series,
                                                    publisher, creators,
                                                    thumbimage, worldcatlink,
                                                    formatid))

            if close_matches:
                return close_matches[0]
            else:
                return ''
        else:
            return ''

    def overdrive_get_record(self, br, log, q, ovrdrv_id):
        import mechanize
        search_url = q + 'SearchResults.aspx?ReserveID={' + ovrdrv_id + '}'
        results_url = q + 'SearchResults.svc/GetResults?sEcho=1&iColumns=18&sColumns=ReserveID%2CTitle%2CSubtitle%2CEdition%2CSeries%2CPublisher%2CFormat%2CFormatID%2CCreators%2CThumbImage%2CShortDescription%2CWorldCatLink%2CExcerptLink%2CCreatorFile%2CSortTitle%2CAvailableToLibrary%2CAvailableToRetailer%2CRelevancyRank&iDisplayStart=0&iDisplayLength=10&sSearch=&bEscapeRegex=true&iSortingCols=1&iSortCol_0=17&sSortDir_0=asc'  # noqa

        # re-initialize the cookiejar so that it's clean
        clean_cj = mechanize.CookieJar()
        br.set_cookiejar(clean_cj)
        # get the base url to set the proper session cookie
        br.open_novisit(q)

        # initialize the search
        self.safe_query(br, search_url)

        # get the results
        req = mechanize.Request(results_url)
        req.add_header('X-Requested-With', 'XMLHttpRequest')
        req.add_header('Referer', search_url)
        req.add_header('Accept', 'application/json, text/javascript, */*')
        raw = br.open_novisit(req)
        raw = type('')(list(raw))
        clean_cj = mechanize.CookieJar()
        br.set_cookiejar(clean_cj)
        return self.sort_ovrdrv_results(raw, log, None, None, None, ovrdrv_id)

    def find_ovrdrv_data(self, br, log, title, author, isbn, ovrdrv_id=None):
        q = base_url
        if ovrdrv_id is None:
            return self.overdrive_search(br, log, q, title, author)
        else:
            return self.overdrive_get_record(br, log, q, ovrdrv_id)

    def to_ovrdrv_data(self, br, log, title=None, author=None, ovrdrv_id=None):
        '''
        Takes either a title/author combo or an Overdrive ID.  One of these
        two must be passed to this function.
        '''
        if ovrdrv_id is not None:
            with cache_lock:
                ans = ovrdrv_data_cache.get(ovrdrv_id, None)
            if ans:
                return ans
            elif ans is False:
                return None
            else:
                ovrdrv_data = self.find_ovrdrv_data(br, log, title, author,
                                                    ovrdrv_id)
        else:
            try:
                ovrdrv_data = self.find_ovrdrv_data(br, log, title, author,
                                                    ovrdrv_id)
            except:
                import traceback
                traceback.print_exc()
                ovrdrv_data = None
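        # cache False (rather than None) on failure so that the unsuccessful
        # lookup is remembered and not retried on the next call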
        with cache_lock:
            ovrdrv_data_cache[
                ovrdrv_id] = ovrdrv_data if ovrdrv_data else False

        return ovrdrv_data if ovrdrv_data else False

    def parse_search_results(self, ovrdrv_data, mi):
        '''
        Parse the formatted search results from the initial Overdrive query and
        add the values to the metadata.

        The list object has these values:
        [cover_url[0], social_metadata_url[1], worldcatlink[2], series[3], series_num[4],
        publisher[5], creators[6], reserveid[7], title[8]]

        '''
        ovrdrv_id = ovrdrv_data[7]
        mi.set_identifier('overdrive', ovrdrv_id)

        if len(ovrdrv_data[3]) > 1:
            mi.series = ovrdrv_data[3]
            if ovrdrv_data[4]:
                try:
                    mi.series_index = float(ovrdrv_data[4])
                except:
                    pass
        mi.publisher = ovrdrv_data[5]
        mi.authors = ovrdrv_data[6]
        mi.title = ovrdrv_data[8]
        cover_url = ovrdrv_data[0]
        if cover_url:
            self.cache_identifier_to_cover_url(ovrdrv_id, cover_url)

    def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
        from html5_parser import parse
        from lxml import html
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.library.comments import sanitize_comments_html

        try:
            raw = br.open_novisit(metadata_url).read()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                return False
            raise
        raw = xml_to_unicode(raw,
                             strip_encoding_pats=True,
                             resolve_entities=True)[0]

        try:
            root = parse(raw, maybe_xhtml=False, sanitize_names=True)
        except Exception:
            return False

        pub_date = root.xpath(
            "//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
        lang = root.xpath(
            "//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
        subjects = root.xpath(
            "//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
        ebook_isbn = root.xpath(
            "//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
        desc = root.xpath(
            "//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]"
        )

        if pub_date:
            from calibre.utils.date import parse_date
            try:
                mi.pubdate = parse_date(pub_date[0].strip())
            except:
                pass
        if lang:
            lang = lang[0].strip().lower()
            lang = {
                'english': 'eng',
                'french': 'fra',
                'german': 'deu',
                'spanish': 'spa'
            }.get(lang, None)
            if lang:
                mi.language = lang

        if ebook_isbn:
            # print("ebook isbn is "+type('')(ebook_isbn[0]))
            isbn = check_isbn(ebook_isbn[0].strip())
            if isbn:
                self.cache_isbn_to_identifier(isbn, ovrdrv_id)
                mi.isbn = isbn
        if subjects:
            mi.tags = [tag.strip() for tag in subjects[0].split(',')]

        if desc:
            desc = desc[0]
            desc = html.tostring(desc, method='html',
                                 encoding='unicode').strip()
            # remove all attributes from tags
            desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
            # Remove comments
            desc = re.sub(r'(?s)<!--.*?-->', '', desc)
            mi.comments = sanitize_comments_html(desc)

        return None
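
# A minimal sketch (not part of the plugin above) of how a metadata Source
# plugin like this is typically exercised with calibre's built-in test
# harness; the title and author used below are purely illustrative.
if __name__ == '__main__':
    # run with: calibre-debug -e __init__.py
    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
                                                      title_test, authors_test)
    test_identify_plugin(OverDrive.name, [
        ({'title': 'The Sea Kings Daughter', 'authors': ['Elizabeth Peters']},
         [title_test('The Sea Kings Daughter', exact=False),
          authors_test(['Elizabeth Peters'])]),
    ])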
Exemplo n.º 18
class Amazon(Source):

    name = 'Amazon.com'
    description = _('Downloads metadata and covers from Amazon')

    capabilities = frozenset(['identify', 'cover'])
    touched_fields = frozenset(['title', 'authors', 'identifier:amazon',
        'identifier:isbn', 'rating', 'comments', 'publisher', 'pubdate',
        'languages', 'series'])
    has_html_comments = True
    supports_gzip_transfer_encoding = True

    AMAZON_DOMAINS = {
            'com': _('US'),
            'fr': _('France'),
            'de': _('Germany'),
            'uk': _('UK'),
            'it': _('Italy'),
            'jp': _('Japan'),
            'es': _('Spain'),
            'br': _('Brazil'),
    }

    options = (
            Option('domain', 'choices', 'com', _('Amazon website to use:'),
                _('Metadata from Amazon will be fetched using this '
                    'country\'s Amazon website.'), choices=AMAZON_DOMAINS),
            )

    def __init__(self, *args, **kwargs):
        Source.__init__(self, *args, **kwargs)
        self.set_amazon_id_touched_fields()

    def test_fields(self, mi):
        '''
        Return the first field from self.touched_fields that is null on the
        mi object
        '''
        for key in self.touched_fields:
            if key.startswith('identifier:'):
                key = key.partition(':')[-1]
                if key == 'amazon':
                    if self.domain != 'com':
                        key += '_' + self.domain
                if not mi.has_identifier(key):
                    return 'identifier: ' + key
            elif mi.is_null(key):
                return key

    @property
    def user_agent(self):
        # Pass in an index to random_user_agent() to test with a particular
        # user agent
        return random_user_agent()

    def save_settings(self, *args, **kwargs):
        Source.save_settings(self, *args, **kwargs)
        self.set_amazon_id_touched_fields()

    def set_amazon_id_touched_fields(self):
        ident_name = "identifier:amazon"
        if self.domain != 'com':
            ident_name += '_' + self.domain
        tf = [x for x in self.touched_fields if not
                x.startswith('identifier:amazon')] + [ident_name]
        self.touched_fields = frozenset(tf)

    def get_domain_and_asin(self, identifiers):
        for key, val in identifiers.iteritems():
            key = key.lower()
            if key in ('amazon', 'asin'):
                return 'com', val
            if key.startswith('amazon_'):
                domain = key.split('_')[-1]
                if domain and domain in self.AMAZON_DOMAINS:
                    return domain, val
        return None, None

    def get_book_url(self, identifiers):  # {{{
        domain, asin = self.get_domain_and_asin(identifiers)
        if domain and asin:
            url = None
            if domain == 'com':
                url = 'http://amzn.com/'+asin
            elif domain == 'uk':
                url = 'http://www.amazon.co.uk/dp/'+asin
            elif domain == 'br':
                url = 'http://www.amazon.com.br/dp/'+asin
            else:
                url = 'http://www.amazon.%s/dp/%s'%(domain, asin)
            if url:
                idtype = 'amazon' if domain == 'com' else 'amazon_'+domain
                return (idtype, asin, url)

    def get_book_url_name(self, idtype, idval, url):
        if idtype == 'amazon':
            return self.name
        return 'A' + idtype.replace('_', '.')[1:]
    # }}}

    @property
    def domain(self):
        x = getattr(self, 'testing_domain', None)
        if x is not None:
            return x
        domain = self.prefs['domain']
        if domain not in self.AMAZON_DOMAINS:
            domain = 'com'

        return domain

    def clean_downloaded_metadata(self, mi):
        docase = (
            mi.language == 'eng' or
            (mi.is_null('language') and self.domain in {'com', 'uk'})
        )
        if mi.title and docase:
            mi.title = fixcase(mi.title)
        mi.authors = fixauthors(mi.authors)
        if mi.tags and docase:
            mi.tags = list(map(fixcase, mi.tags))
        mi.isbn = check_isbn(mi.isbn)

    def get_website_domain(self, domain):
        udomain = domain
        if domain == 'uk':
            udomain = 'co.uk'
        elif domain == 'jp':
            udomain = 'co.jp'
        elif domain == 'br':
            udomain = 'com.br'
        return udomain

    def create_query(self, log, title=None, authors=None, identifiers={},  # {{{
            domain=None):
        from urllib import urlencode
        if domain is None:
            domain = self.domain

        idomain, asin = self.get_domain_and_asin(identifiers)
        if idomain is not None:
            domain = idomain

        # See the amazon detailed search page to get all options
        q = {'search-alias': 'aps',
             'unfiltered': '1',
            }

        if domain == 'com':
            q['sort'] = 'relevanceexprank'
        else:
            q['sort'] = 'relevancerank'

        isbn = check_isbn(identifiers.get('isbn', None))

        if asin is not None:
            q['field-keywords'] = asin
        elif isbn is not None:
            q['field-isbn'] = isbn
        else:
            # Only return book results
            q['search-alias'] = 'digital-text' if domain == 'br' else 'stripbooks'
            if title:
                title_tokens = list(self.get_title_tokens(title))
                if title_tokens:
                    q['field-title'] = ' '.join(title_tokens)
            if authors:
                author_tokens = self.get_author_tokens(authors,
                        only_first_author=True)
                if author_tokens:
                    q['field-author'] = ' '.join(author_tokens)

        if not ('field-keywords' in q or 'field-isbn' in q or
                ('field-title' in q)):
            # Insufficient metadata to make an identify query
            return None, None

        # magic parameter to enable Japanese Shift_JIS encoding.
        if domain == 'jp':
            q['__mk_ja_JP'] = u'カタカナ'

        if domain == 'jp':
            encode_to = 'Shift_JIS'
        else:
            encode_to = 'latin1'
        encoded_q = dict([(x.encode(encode_to, 'ignore'), y.encode(encode_to,
            'ignore')) for x, y in
            q.iteritems()])
        url = 'http://www.amazon.%s/s/?'%self.get_website_domain(domain) + urlencode(encoded_q)
        return url, domain

    # }}}

    def get_cached_cover_url(self, identifiers):  # {{{
        url = None
        domain, asin = self.get_domain_and_asin(identifiers)
        if asin is None:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                asin = self.cached_isbn_to_identifier(isbn)
        if asin is not None:
            url = self.cached_identifier_to_cover_url(asin)

        return url
    # }}}

    def parse_results_page(self, root, domain):  # {{{
        from lxml.html import tostring

        matches = []

        def title_ok(title):
            title = title.lower()
            bad = ['bulk pack', '[audiobook]', '[audio cd]', '(a book companion)', '( slipcase with door )']
            if self.domain == 'com':
                bad.extend(['(%s edition)' % x for x in ('spanish', 'german')])
            for x in bad:
                if x in title:
                    return False
            return True

        for div in root.xpath(r'//div[starts-with(@id, "result_")]'):
            links = div.xpath(r'descendant::a[@class="title" and @href]')
            if not links:
                # New amazon markup
                links = div.xpath('descendant::h3/a[@href]')
            for a in links:
                title = tostring(a, method='text', encoding=unicode)
                if title_ok(title):
                    url = a.get('href')
                    if url.startswith('/'):
                        url = 'http://www.amazon.%s%s' % (self.get_website_domain(domain), url)
                    matches.append(url)
                break

        if not matches:
            # This can happen for some user agents that Amazon thinks are
            # mobile/less capable
            for td in root.xpath(
                r'//div[@id="Results"]/descendant::td[starts-with(@id, "search:Td:")]'):
                for a in td.xpath(r'descendant::td[@class="dataColumn"]/descendant::a[@href]/span[@class="srTitle"]/..'):
                    title = tostring(a, method='text', encoding=unicode)
                    if title_ok(title):
                        url = a.get('href')
                        if url.startswith('/'):
                            url = 'http://www.amazon.%s%s' % (self.get_website_domain(domain), url)
                        matches.append(url)
                    break

        # Keep only the top 5 matches as the matches are sorted by relevance by
        # Amazon so lower matches are not likely to be very relevant
        return matches[:5]
    # }}}

    def identify(self, log, result_queue, abort, title=None, authors=None,  # {{{
            identifiers={}, timeout=30):
        '''
        Note this method will retry without identifiers automatically if no
        match is found with identifiers.
        '''
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.chardet import xml_to_unicode
        from lxml.html import tostring
        import html5lib

        testing = getattr(self, 'running_a_test', False)

        query, domain = self.create_query(log, title=title, authors=authors,
                identifiers=identifiers)
        if query is None:
            log.error('Insufficient metadata to construct query')
            return
        br = self.browser
        if testing:
            print ('Using user agent for amazon: %s'%self.user_agent)
        try:
            raw = br.open_novisit(query, timeout=timeout).read().strip()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                log.error('Query malformed: %r'%query)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = _('Amazon timed out. Try again later.')
                log.error(msg)
            else:
                msg = 'Failed to make identify query: %r'%query
                log.exception(msg)
            return as_unicode(msg)

        raw = clean_ascii_chars(xml_to_unicode(raw,
            strip_encoding_pats=True, resolve_entities=True)[0])

        if testing:
            import tempfile
            with tempfile.NamedTemporaryFile(prefix='amazon_results_',
                    suffix='.html', delete=False) as f:
                f.write(raw.encode('utf-8'))
            print ('Downloaded html for results page saved in', f.name)

        matches = []
        found = '<title>404 - ' not in raw

        if found:
            try:
                root = html5lib.parse(raw, treebuilder='lxml',
                        namespaceHTMLElements=False)
            except:
                msg = 'Failed to parse amazon page for query: %r'%query
                log.exception(msg)
                return msg

            errmsg = root.xpath('//*[@id="errorMessage"]')
            if errmsg:
                msg = tostring(errmsg[0], method='text', encoding=unicode).strip()
                log.error(msg)
                # The error is almost always a not found error
                found = False

        if found:
            matches = self.parse_results_page(root, domain)

        if abort.is_set():
            return

        if not matches:
            if identifiers and title and authors:
                log('No matches found with identifiers, retrying using only'
                        ' title and authors. Query: %r'%query)
                return self.identify(log, result_queue, abort, title=title,
                        authors=authors, timeout=timeout)
            log.error('No matches found with query: %r'%query)
            return

        workers = [Worker(url, result_queue, br, log, i, domain, self,
                            testing=testing) for i, url in enumerate(matches)]

        for w in workers:
            w.start()
            # Don't send all requests at the same time
            time.sleep(0.1)

        while not abort.is_set():
            a_worker_is_alive = False
            for w in workers:
                w.join(0.2)
                if abort.is_set():
                    break
                if w.is_alive():
                    a_worker_is_alive = True
            if not a_worker_is_alive:
                break

        return None
    # }}}

    def download_cover(self, log, result_queue, abort,  # {{{
            title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.info('No cached cover found, running identify')
            rq = Queue()
            self.identify(log, rq, abort, title=title, authors=authors,
                    identifiers=identifiers)
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers))
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break
        if cached_url is None:
            log.info('No cover found')
            return

        if abort.is_set():
            return
        br = self.browser
        log('Downloading cover from:', cached_url)
        try:
            cdata = br.open_novisit(cached_url, timeout=timeout).read()
            result_queue.put((self, cdata))
        except:
            log.exception('Failed to download cover from:', cached_url)
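
# --- Hedged sketch (editor's addition, not part of the plugin) ---
# How create_query() above encodes query parameters before urlencode():
# amazon.co.jp expects Shift_JIS-encoded values (signalled by the magic
# __mk_ja_JP parameter), while every other store gets latin-1. The title
# below is only an illustrative value. Python 2, like the plugin itself.
from urllib import urlencode

_q = {'search-alias': 'stripbooks',
      'field-title': u'吾輩は猫である',
      '__mk_ja_JP': u'カタカナ'}
_encoded_q = dict((k.encode('Shift_JIS', 'ignore'), v.encode('Shift_JIS', 'ignore'))
                  for k, v in _q.items())
print('http://www.amazon.co.jp/s/?' + urlencode(_encoded_q))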
Exemplo n.º 19
class Moly_hu(Source):
    name = 'Moly_hu'
    description = _('Downloads metadata and covers from moly.hu')
    author = 'Hoffer Csaba & Kloon & fatsadt & otapi & Dezso'
    version = (1, 0, 9)
    minimum_calibre_version = (0, 8, 0)

    capabilities = frozenset(['identify', 'cover'])
    touched_fields = frozenset([
        'title', 'authors', 'identifier:isbn', 'identifier:moly_hu', 'tags',
        'comments', 'rating', 'series', 'series_index', 'publisher', 'pubdate',
        'language', 'languages'
    ])
    has_html_comments = False
    supports_gzip_transfer_encoding = False
    can_get_multiple_covers = True

    KEY_MAX_BOOKS = 'max_books'
    KEY_MAX_COVERS = 'max_covers'

    options = (Option(
        KEY_MAX_BOOKS, 'number', 3, _('Maximum number of books to get'),
        _('The maximum number of books to process from the moly.hu search result'
          )),
               Option(
                   KEY_MAX_COVERS, 'number', 5,
                   _('Maximum number of covers to get'),
                   _('The maximum number of covers to process for the chosen book'
                     )))

    BASE_URL = 'https://moly.hu'
    BOOK_URL = BASE_URL + '/konyvek/'
    SEARCH_URL = BASE_URL + '/kereses?utf8=%E2%9C%93&q='

    def create_query(self, log, title=None, authors=None, identifiers={}):
        isbn = check_isbn(identifiers.get('isbn', None))
        if isbn is not None:
            return Moly_hu.SEARCH_URL + isbn
        if title is not None:
            search_title = quote(title.encode('utf-8'))
        else:
            search_title = ''

        if authors is not None:
            search_author = quote(authors[0].encode('utf-8'))
        else:
            search_author = ''

        search_page = Moly_hu.SEARCH_URL + \
            '%s+%s' % (search_author, search_title)

        return search_page

    def get_cached_cover_url(self, identifiers):
        url = None
        moly_id = identifiers.get('moly_hu', None)
        if moly_id is None:
            isbn = check_isbn(identifiers.get('isbn', None))
            if isbn is not None:
                moly_id = self.cached_isbn_to_identifier(isbn)
        if moly_id is not None:
            url = self.cached_identifier_to_cover_url(moly_id)
        return url

    def identify(self,
                 log,
                 result_queue,
                 abort,
                 title,
                 authors,
                 identifiers={},
                 timeout=30):
        '''
        Note this method will retry without identifiers automatically if no
        match is found with identifiers.
        '''
        matches = []
        moly_id = identifiers.get('moly_hu', None)
        log.info(u'\nTitle:%s\nAuthors:%s\n' % (title, authors))
        br = browser()
        if moly_id:
            matches.append(Moly_hu.BOOK_URL + moly_id)
        else:
            isbn = check_isbn(identifiers.get('isbn', None))
            query = self.create_query(log,
                                      title=title,
                                      authors=authors,
                                      identifiers=identifiers)
            if query is None:
                log.error('Insufficient metadata to construct query')
                return
            try:
                log.info('Querying: %s' % query)
                response = br.open(query)
            except Exception as e:
                if callable(getattr(e, 'getcode',
                                    None)) and e.getcode() == 404:
                    log.info('Failed to find match for ISBN: %s' % isbn)
                    return
                else:
                    err = 'Failed to make identify query: %r' % query
                    log.exception(err)
                    return as_unicode(e)

            try:
                raw = response.read().strip()
                raw = raw.decode('utf-8', errors='replace')
                if not raw:
                    log.error('Failed to get raw result for query: %r' % query)
                    return
                root = fromstring(clean_ascii_chars(raw))
            except:
                msg = 'Failed to parse moly.hu page for query: %r' % query
                log.exception(msg)
                return msg
            self._parse_search_results(log, title, authors, root, matches,
                                       timeout, isbn)

        if abort.is_set():
            return

        if not matches:
            log.error('No matches found with query: %r' % query)
            if identifiers and title and authors:
                log.info(
                    'No matches found with identifiers, retrying using only'
                    ' title and authors')
                return self.identify(log,
                                     result_queue,
                                     abort,
                                     title=title,
                                     authors=authors,
                                     timeout=timeout)
            elif title and authors and title != title.split("(")[0]:
                log.info(
                    'No matches found with authors and title; retrying with the parenthesised part removed from the title'
                )
                tit = title.split("(")[0]
                return self.identify(log,
                                     result_queue,
                                     abort,
                                     title=tit,
                                     authors=authors,
                                     timeout=timeout)
            elif title and authors:
                log.info(
                    'No matches found with authors and title, retrying using only title'
                )
                return self.identify(log,
                                     result_queue,
                                     abort,
                                     title=title,
                                     authors=None,
                                     timeout=timeout)
            return

        from calibre_plugins.moly_hu.worker import Worker
        workers = [
            Worker(url, result_queue, br, log, i, self)
            for i, url in enumerate(matches)
        ]

        for w in workers:
            w.start()
            time.sleep(0.1)

        while not abort.is_set():
            a_worker_is_alive = False
            for w in workers:
                w.join(0.2)
                if abort.is_set():
                    break
                if w.is_alive():
                    a_worker_is_alive = True
            if not a_worker_is_alive:
                break

        return None

    def _parse_search_results(self, log, orig_title, orig_authors, root,
                              matches, timeout, isbn):
        max_results = self.prefs[Moly_hu.KEY_MAX_BOOKS]
        results = root.xpath('//a[@class="book_selector"]')
        log.info('Found %d possible books (max: %d)' %
                 (len(results), max_results))
        i = 0
        for result in results:
            book_urls = result.xpath('@href')

            if isbn is None:
                etree.strip_tags(result, 'strong')
                author_n_title = result.text
                author_n_titles = author_n_title.split(':', 1)
                author = author_n_titles[0].strip(' \r\n\t')
                title = author_n_titles[1].strip(' \r\n\t')
                log.info('Orig: %s, target: %s' %
                         (self.strip_accents(orig_title),
                          self.strip_accents(title)))

                if orig_title:
                    if orig_title.lower() not in title.lower(
                    ) and self.strip_accents(
                            orig_title) not in self.strip_accents(title):
                        continue
                if orig_authors:
                    author1 = orig_authors[0]
                    authorsplit = author1.split(" ")
                    author2 = author1
                    if len(authorsplit) > 1:
                        author2 = '%s %s' % (authorsplit[1], authorsplit[0])
                    if author1.lower() not in author.lower(
                    ) and self.strip_accents(
                            author1) not in self.strip_accents(
                                author) and author2.lower(
                                ) not in author.lower() and self.strip_accents(
                                    author2) not in self.strip_accents(author):
                        continue

            for book_url in book_urls:
                result_url = Moly_hu.BASE_URL + book_url

                if (result_url not in matches):
                    matches.append(result_url)
                    i += 1
                if (i >= max_results):
                    return
        if i == 0:
            for result in results:
                book_urls = result.xpath('@href')
                for book_url in book_urls:
                    result_url = Moly_hu.BASE_URL + book_url
                    if (result_url not in matches):
                        matches.append(result_url)
                        i += 1
                    if (i >= max_results):
                        return

    def strip_accents(self, s):
        if s is None:
            return None
        symbols = (u"öÖüÜóÓőŐúÚéÉáÁűŰíÍ", u"oOuUoOoOuUeEaAuUiI")

        tr = dict([(ord(a), ord(b)) for (a, b) in zip(*symbols)])

        return s.translate(tr).lower()

    def download_cover(self,
                       log,
                       result_queue,
                       abort,
                       title=None,
                       authors=None,
                       identifiers={},
                       timeout=30,
                       get_best_cover=False):
        if not title:
            return
        urls = self.get_image_urls(title, authors, identifiers, log, abort,
                                   timeout)
        self.download_multiple_covers(title, authors, urls, get_best_cover,
                                      timeout, result_queue, abort, log)

    def get_image_urls(self, title, authors, identifiers, log, abort, timeout):
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.info('No cached cover found, running identify')
            rq = Queue()
            self.identify(log,
                          rq,
                          abort,
                          title=title,
                          authors=authors,
                          identifiers=identifiers)
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers))
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break

        if cached_url is not None:
            # download_multiple_covers() expects a list of URLs
            return [cached_url]

        log.info('No cover found')
        return []
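
# --- Hedged usage sketch (editor's addition, not part of the plugin) ---
# The accent-stripping translation table built by Moly_hu.strip_accents()
# above, shown standalone with an illustrative Hungarian author name.
_symbols = (u"öÖüÜóÓőŐúÚéÉáÁűŰíÍ", u"oOuUoOoOuUeEaAuUiI")
_tr = dict((ord(a), ord(b)) for (a, b) in zip(*_symbols))
print(u"Gárdonyi Géza".translate(_tr).lower())  # -> gardonyi geza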
Exemplo n.º 20
class GoogleImages(Source):

    name = 'Google Images'
    version = (1, 0, 0)
    minimum_calibre_version = (2, 80, 0)
    description = _(
        'Downloads covers from a Google Image search. Useful to find larger/alternate covers.'
    )
    capabilities = frozenset(['cover'])
    can_get_multiple_covers = True
    supports_gzip_transfer_encoding = True
    options = (
        Option(
            'max_covers', 'number', 5, _('Maximum number of covers to get'),
            _('The maximum number of covers to process from the Google search result'
              )),
        Option('size',
               'choices',
               'svga',
               _('Cover size'),
               _('Search for covers larger than the specified size'),
               choices=OrderedDict((
                   (
                       'any',
                       _('Any size'),
                   ),
                   (
                       'l',
                       _('Large'),
                   ),
                   (
                       'qsvga',
                       _('Larger than %s') % '400x300',
                   ),
                   (
                       'vga',
                       _('Larger than %s') % '640x480',
                   ),
                   (
                       'svga',
                       _('Larger than %s') % '600x800',
                   ),
                   (
                       'xga',
                       _('Larger than %s') % '1024x768',
                   ),
                   (
                       '2mp',
                       _('Larger than %s') % '2 MP',
                   ),
                   (
                       '4mp',
                       _('Larger than %s') % '4 MP',
                   ),
               ))),
    )

    def download_cover(self,
                       log,
                       result_queue,
                       abort,
                       title=None,
                       authors=None,
                       identifiers={},
                       timeout=30,
                       get_best_cover=False):
        if not title:
            return
        timeout = max(60, timeout)  # Needs at least a minute
        title = ' '.join(self.get_title_tokens(title))
        author = ' '.join(self.get_author_tokens(authors))
        urls = self.get_image_urls(title, author, log, abort, timeout)
        self.download_multiple_covers(title, authors, urls, get_best_cover,
                                      timeout, result_queue, abort, log)

    @property
    def user_agent(self):
        return USER_AGENT

    def get_image_urls(self, title, author, log, abort, timeout):
        from calibre.utils.cleantext import clean_ascii_chars
        from urllib import urlencode
        import html5lib
        import json
        from collections import OrderedDict
        ans = OrderedDict()
        br = self.browser
        q = urlencode({
            'as_q': ('%s %s' % (title, author)).encode('utf-8')
        }).decode('utf-8')
        sz = self.prefs['size']
        if sz == 'any':
            sz = ''
        elif sz == 'l':
            sz = 'isz:l,'
        else:
            sz = 'isz:lt,islt:%s,' % sz
        # See https://www.google.com/advanced_image_search to understand this
        # URL scheme
        url = 'https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq=&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg'.format(
            q, sz)
        log('Search URL: ' + url)
        raw = br.open(url).read().decode('utf-8')
        root = html5lib.parse(clean_ascii_chars(raw),
                              treebuilder='lxml',
                              namespaceHTMLElements=False)
        for div in root.xpath('//div[@class="rg_meta"]'):
            try:
                data = json.loads(div.text)
            except Exception:
                continue
            if 'ou' in data:
                ans[data['ou']] = True
        return list(ans.iterkeys())
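
# --- Hedged sketch (editor's addition, not part of the plugin) ---
# The Google Images URL built by get_image_urls() above, reproduced
# standalone for an illustrative title/author and the default 'svga' size
# ('any' maps to '', 'l' to 'isz:l,'). Python 2, as in the plugin.
from urllib import urlencode

_q = urlencode({'as_q': u'Dune Frank Herbert'.encode('utf-8')}).decode('utf-8')
_sz = 'isz:lt,islt:%s,' % 'svga'
_url = ('https://www.google.com/search?as_st=y&tbm=isch&{}&as_epq=&as_oq='
        '&as_eq=&cr=&as_sitesearch=&safe=images&tbs={}iar:t,ift:jpg').format(_q, _sz)
print(_url)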
Exemplo n.º 21
class OverDrive(Source):

    name = 'Overdrive'
    version = (1, 0, 0)
    minimum_calibre_version = (2, 80, 0)
    description = _(
        'Downloads metadata and covers from Overdrive\'s Content Reserve')

    capabilities = frozenset(['identify', 'cover'])
    touched_fields = frozenset([
        'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher',
        'identifier:isbn', 'series', 'series_index', 'languages',
        'identifier:overdrive'
    ])
    has_html_comments = True
    supports_gzip_transfer_encoding = False
    cached_cover_url_is_reliable = True

    options = (Option(
        'get_full_metadata', 'bool', True, _('Download all metadata (slow)'),
        _('Enable this option to gather all metadata available from Overdrive.'
          )), )

    config_help_message = '<p>' + _(
        'Additional metadata can be taken from Overdrive\'s book detail'
        ' page. This includes a limited set of tags used by libraries, comments, language,'
        ' and the e-book ISBN. Collecting this data is disabled by default due to the extra'
        ' time required. Check the download all metadata option below to'
        ' enable downloading this data.')

    def identify(
            self,
            log,
            result_queue,
            abort,
            title=None,
            authors=None,  # {{{
            identifiers={},
            timeout=30):
        ovrdrv_id = identifiers.get('overdrive', None)
        isbn = identifiers.get('isbn', None)

        br = self.browser
        ovrdrv_data = self.to_ovrdrv_data(br, log, title, authors, ovrdrv_id)
        if ovrdrv_data:
            title = ovrdrv_data[8]
            authors = ovrdrv_data[6]
            mi = Metadata(title, authors)
            self.parse_search_results(ovrdrv_data, mi)
            if ovrdrv_id is None:
                ovrdrv_id = ovrdrv_data[7]

            if self.prefs['get_full_metadata']:
                self.get_book_detail(br, ovrdrv_data[1], mi, ovrdrv_id, log)

            if isbn is not None:
                self.cache_isbn_to_identifier(isbn, ovrdrv_id)

            result_queue.put(mi)

        return None

    # }}}

    def download_cover(
            self,
            log,
            result_queue,
            abort,  # {{{
            title=None,
            authors=None,
            identifiers={},
            timeout=30,
            get_best_cover=False):
        import mechanize
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.info('No cached cover found, running identify')
            rq = Queue()
            self.identify(log,
                          rq,
                          abort,
                          title=title,
                          authors=authors,
                          identifiers=identifiers)
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers))
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break
        if cached_url is None:
            log.info('No cover found')
            return

        if abort.is_set():
            return

        ovrdrv_id = identifiers.get('overdrive', None)
        br = self.browser
        req = mechanize.Request(cached_url)
        if ovrdrv_id is not None:
            referer = self.get_base_referer(
            ) + 'ContentDetails-Cover.htm?ID=' + ovrdrv_id
            req.add_header('referer', referer)

        log('Downloading cover from:', cached_url)
        try:
            cdata = br.open_novisit(req, timeout=timeout).read()
            result_queue.put((self, cdata))
        except:
            log.exception('Failed to download cover from:', cached_url)

    # }}}

    def get_cached_cover_url(self, identifiers):  # {{{
        url = None
        ovrdrv_id = identifiers.get('overdrive', None)
        if ovrdrv_id is None:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                ovrdrv_id = self.cached_isbn_to_identifier(isbn)
        if ovrdrv_id is not None:
            url = self.cached_identifier_to_cover_url(ovrdrv_id)

        return url

    # }}}

    def get_base_referer(
            self):  # to be used for passing referrer headers to cover download
        choices = [
            'https://overdrive.chipublib.org/82DC601D-7DDE-4212-B43A-09D821935B01/10/375/en/',
            'https://emedia.clevnet.org/9D321DAD-EC0D-490D-BFD8-64AE2C96ECA8/10/241/en/',
            'https://singapore.lib.overdrive.com/F11D55BE-A917-4D63-8111-318E88B29740/10/382/en/',
            'https://ebooks.nypl.org/20E48048-A377-4520-BC43-F8729A42A424/10/257/en/',
            'https://spl.lib.overdrive.com/5875E082-4CB2-4689-9426-8509F354AFEF/10/335/en/'
        ]
        return choices[random.randint(0, len(choices) - 1)]

    def format_results(self, reserveid, od_title, subtitle, series, publisher,
                       creators, thumbimage, worldcatlink, formatid):
        fix_slashes = re.compile(r'\\/')
        thumbimage = fix_slashes.sub('/', thumbimage)
        worldcatlink = fix_slashes.sub('/', worldcatlink)
        cover_url = re.sub(r'(?P<img>(Ima?g(eType-)?))200', r'\g<img>100',
                           thumbimage)
        social_metadata_url = base_url + 'TitleInfo.aspx?ReserveID=' + reserveid + '&FormatID=' + formatid
        series_num = ''
        if not series:
            if subtitle:
                title = od_title + ': ' + subtitle
            else:
                title = od_title
        else:
            title = od_title
            m = re.search("([0-9]+$)", subtitle)
            if m:
                series_num = float(m.group(1))
        return [
            cover_url, social_metadata_url, worldcatlink, series, series_num,
            publisher, creators, reserveid, title
        ]

    def safe_query(self, br, query_url, post=''):
        '''
        The query must be initialized by loading an empty search results page;
        this page attempts to set a cookie that mechanize doesn't like.
        Copy the cookiejar to a separate instance and make a one-off request
        with the temporary cookiejar.
        '''
        import mechanize
        goodcookies = br._ua_handlers['_cookies'].cookiejar
        clean_cj = mechanize.CookieJar()
        cookies_to_copy = []
        for cookie in goodcookies:
            copied_cookie = copy.deepcopy(cookie)
            cookies_to_copy.append(copied_cookie)
        for copied_cookie in cookies_to_copy:
            clean_cj.set_cookie(copied_cookie)

        if post:
            br.open_novisit(query_url, post)
        else:
            br.open_novisit(query_url)

        br.set_cookiejar(clean_cj)

    def overdrive_search(self, br, log, q, title, author):
        import mechanize
        # re-initialize the cookiejar so that it's clean
        clean_cj = mechanize.CookieJar()
        br.set_cookiejar(clean_cj)
        q_query = q + 'default.aspx/SearchByKeyword'
        q_init_search = q + 'SearchResults.aspx'
        # get first author as string - convert this to a proper cleanup function later
        author_tokens = list(
            self.get_author_tokens(author, only_first_author=True))
        title_tokens = list(
            self.get_title_tokens(title,
                                  strip_joiners=False,
                                  strip_subtitle=True))

        xref_q = ''
        if len(author_tokens) <= 1:
            initial_q = ' '.join(title_tokens)
            xref_q = '+'.join(author_tokens)
        else:
            initial_q = ' '.join(author_tokens)
            for token in title_tokens:
                if len(xref_q) < len(token):
                    xref_q = token

        log.error('Initial query is %s' % initial_q)
        log.error('Cross reference query is %s' % xref_q)

        q_xref = q + 'SearchResults.svc/GetResults?iDisplayLength=50&sSearch=' + xref_q
        query = '{"szKeyword":"' + initial_q + '"}'

        # main query, requires specific Content Type header
        req = mechanize.Request(q_query)
        req.add_header('Content-Type', 'application/json; charset=utf-8')
        br.open_novisit(req, query)

        # initiate the search without messing up the cookiejar
        self.safe_query(br, q_init_search)

        # get the search results object
        results = False
        iterations = 0
        while results is False:
            iterations += 1
            xreq = mechanize.Request(q_xref)
            xreq.add_header('X-Requested-With', 'XMLHttpRequest')
            xreq.add_header('Referer', q_init_search)
            xreq.add_header('Accept', 'application/json, text/javascript, */*')
            raw = br.open_novisit(xreq).read()
            for m in re.finditer(
                    ur'"iTotalDisplayRecords":(?P<displayrecords>\d+).*?"iTotalRecords":(?P<totalrecords>\d+)',
                    raw):
                if int(m.group('totalrecords')) == 0:
                    return ''
                elif int(m.group('displayrecords')) >= 1:
                    results = True
                elif int(m.group('totalrecords')) >= 1 and iterations < 3:
                    if xref_q.find('+') != -1:
                        xref_tokens = xref_q.split('+')
                        xref_q = xref_tokens[0]
                        for token in xref_tokens:
                            if len(xref_q) < len(token):
                                xref_q = token
                        # log.error('rewrote xref_q, new query is '+xref_q)
                else:
                    xref_q = ''
                q_xref = q + 'SearchResults.svc/GetResults?iDisplayLength=50&sSearch=' + xref_q

        return self.sort_ovrdrv_results(raw, log, title, title_tokens, author,
                                        author_tokens)
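
# --- Hedged sketch (editor's addition, not part of the plugin) ---
# The keyword selection used by overdrive_search() above: with more than one
# author token the author string becomes the main query and the longest title
# token becomes the cross-reference filter; with a single author token the
# roles are reversed. Tokens below are illustrative.
_author_tokens = ['arthur', 'clarke']
_title_tokens = ['rendezvous', 'with', 'rama']
if len(_author_tokens) <= 1:
    _initial_q = ' '.join(_title_tokens)
    _xref_q = '+'.join(_author_tokens)
else:
    _initial_q = ' '.join(_author_tokens)
    _xref_q = ''
    for _token in _title_tokens:
        if len(_xref_q) < len(_token):
            _xref_q = _token
print(_initial_q + ' / ' + _xref_q)  # -> arthur clarke / rendezvous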
Exemplo n.º 22
class Douban(Source):

    name = 'Douban Books'
    author = 'Li Fanxi, xcffl, jnozsc'
    version = (3, 1, 0)
    minimum_calibre_version = (2, 80, 0)

    description = _(
        'Downloads metadata and covers from Douban.com. '
        'Useful only for Chinese language books.'
    )

    capabilities = frozenset(['identify', 'cover'])
    touched_fields = frozenset([
        'title', 'authors', 'tags', 'pubdate', 'comments', 'publisher',
        'identifier:isbn', 'rating', 'identifier:douban'
    ])  # language currently disabled
    supports_gzip_transfer_encoding = True
    cached_cover_url_is_reliable = True

    DOUBAN_API_KEY = '0df993c66c0c636e29ecbb5344252a4a'
    DOUBAN_API_URL = 'https://api.douban.com/v2/book/search'
    DOUBAN_BOOK_URL = 'https://book.douban.com/subject/%s/'

    options = (
        Option(
            'include_subtitle_in_title', 'bool', True,
            _('Include subtitle in book title:'),
            _('Whether to append subtitle in the book title.')
        ),
    )

    def to_metadata(self, browser, log, entry_, timeout):  # {{{
        from calibre.utils.date import parse_date, utcnow

        douban_id = entry_.get('id')
        title = entry_.get('title')
        description = entry_.get('summary')
        # subtitle = entry_.get('subtitle')  # TODO: std metadata doesn't have this field
        publisher = entry_.get('publisher')
        isbn = entry_.get('isbn13')  # ISBN10 is obsolete, use ISBN13
        pubdate = entry_.get('pubdate')
        authors = entry_.get('author')
        book_tags = entry_.get('tags')
        rating = entry_.get('rating')
        cover_url = entry_.get('images', {}).get('large')
        series = entry_.get('series')

        if not authors:
            authors = [_('Unknown')]
        if not douban_id or not title:
            # Silently discard this entry
            return None

        mi = Metadata(title, authors)
        mi.identifiers = {'douban': douban_id}
        mi.publisher = publisher
        mi.comments = description
        # mi.subtitle = subtitle

        # ISBN
        isbns = []
        if isinstance(isbn, (type(''), bytes)):
            if check_isbn(isbn):
                isbns.append(isbn)
        else:
            for x in isbn:
                if check_isbn(x):
                    isbns.append(x)
        if isbns:
            mi.isbn = sorted(isbns, key=len)[-1]
        mi.all_isbns = isbns

        # Tags
        if book_tags:
            mi.tags = [tag['name'] for tag in book_tags]

        # pubdate
        if pubdate:
            try:
                default = utcnow().replace(day=15)
                mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
            except:
                log.error('Failed to parse pubdate %r' % pubdate)

        # Ratings
        if rating:
            try:
                mi.rating = float(rating['average']) / 2.0
            except:
                log.exception('Failed to parse rating')
                mi.rating = 0

        # Cover
        mi.has_douban_cover = None
        u = cover_url
        if u:
            # If URL contains "book-default", the book doesn't have a cover
            if u.find('book-default') == -1:
                mi.has_douban_cover = u

        # Series
        if series:
            mi.series = series['title']

        return mi

    # }}}

    def get_book_url(self, identifiers):  # {{{
        db = identifiers.get('douban', None)
        if db is not None:
            return ('douban', db, self.DOUBAN_BOOK_URL % db)

    # }}}

    def create_query(self, log, title=None, authors=None, identifiers={}):  # {{{
        try:
            from urllib.parse import urlencode
        except ImportError:
            from urllib import urlencode
        SEARCH_URL = 'https://api.douban.com/v2/book/search?count=10&'
        ISBN_URL = 'https://api.douban.com/v2/book/isbn/'
        SUBJECT_URL = 'https://api.douban.com/v2/book/'

        q = ''
        t = None
        isbn = check_isbn(identifiers.get('isbn', None))
        subject = identifiers.get('douban', None)
        if isbn is not None:
            q = isbn
            t = 'isbn'
        elif subject is not None:
            q = subject
            t = 'subject'
        elif title or authors:

            def build_term(prefix, parts):
                return ' '.join(x for x in parts)

            title_tokens = list(self.get_title_tokens(title))
            if title_tokens:
                q += build_term('title', title_tokens)
            author_tokens = list(
                self.get_author_tokens(authors, only_first_author=True)
            )
            if author_tokens:
                q += ((' ' if q != '' else '') + build_term('author', author_tokens))
            t = 'search'
        q = q.strip()
        if isinstance(q, type(u'')):
            q = q.encode('utf-8')
        if not q:
            return None
        url = None
        if t == "isbn":
            url = ISBN_URL + q
        elif t == 'subject':
            url = SUBJECT_URL + q
        else:
            url = SEARCH_URL + urlencode({
                'q': q,
            })
        if self.DOUBAN_API_KEY and self.DOUBAN_API_KEY != '':
            if t == "isbn" or t == "subject":
                url = url + "?apikey=" + self.DOUBAN_API_KEY
            else:
                url = url + "&apikey=" + self.DOUBAN_API_KEY
        return url

    # }}}

    def download_cover(
        self,
        log,
        result_queue,
        abort,  # {{{
        title=None,
        authors=None,
        identifiers={},
        timeout=30,
        get_best_cover=False
    ):
        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.info('No cached cover found, running identify')
            rq = Queue()
            self.identify(
                log,
                rq,
                abort,
                title=title,
                authors=authors,
                identifiers=identifiers
            )
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(
                key=self.identify_results_keygen(
                    title=title, authors=authors, identifiers=identifiers
                )
            )
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break
        if cached_url is None:
            log.info('No cover found')
            return

        if abort.is_set():
            return
        br = self.browser
        log('Downloading cover from:', cached_url)
        try:
            cdata = br.open_novisit(cached_url, timeout=timeout).read()
            if cdata:
                result_queue.put((self, cdata))
        except:
            log.exception('Failed to download cover from:', cached_url)

    # }}}

    def get_cached_cover_url(self, identifiers):  # {{{
        url = None
        db = identifiers.get('douban', None)
        if db is None:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                db = self.cached_isbn_to_identifier(isbn)
        if db is not None:
            url = self.cached_identifier_to_cover_url(db)

        return url

    # }}}

    def get_all_details(
        self,
        br,
        log,
        entries,
        abort,  # {{{
        result_queue,
        timeout
    ):
        for relevance, i in enumerate(entries):
            try:
                ans = self.to_metadata(br, log, i, timeout)
                if isinstance(ans, Metadata):
                    ans.source_relevance = relevance
                    db = ans.identifiers['douban']
                    for isbn in getattr(ans, 'all_isbns', []):
                        self.cache_isbn_to_identifier(isbn, db)
                    if ans.has_douban_cover:
                        self.cache_identifier_to_cover_url(db, ans.has_douban_cover)
                    self.clean_downloaded_metadata(ans)
                    result_queue.put(ans)
            except:
                log.exception('Failed to get metadata for identify entry:', i)
            if abort.is_set():
                break

    # }}}

    def identify(
        self,
        log,
        result_queue,
        abort,
        title=None,
        authors=None,  # {{{
        identifiers={},
        timeout=30
    ):
        import json

        query = self.create_query(
            log, title=title, authors=authors, identifiers=identifiers
        )
        if not query:
            log.error('Insufficient metadata to construct query')
            return
        br = self.browser
        try:
            raw = br.open_novisit(query, timeout=timeout).read()
        except Exception as e:
            log.exception('Failed to make identify query: %r' % query)
            return as_unicode(e)
        try:
            j = json.loads(raw)
        except Exception as e:
            log.exception('Failed to parse identify results')
            return as_unicode(e)
        if 'books' in j:
            entries = j['books']
        else:
            entries = []
            entries.append(j)
        if not entries and identifiers and title and authors and \
                not abort.is_set():
            return self.identify(
                log,
                result_queue,
                abort,
                title=title,
                authors=authors,
                timeout=timeout
            )
        # There is no point running these queries in threads as douban
        # throttles requests returning 403 Forbidden errors
        self.get_all_details(br, log, entries, abort, result_queue, timeout)

        return None
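
# --- Hedged illustration (editor's addition, not part of the plugin) ---
# Minimal shape of a Douban API "book" entry as consumed by to_metadata()
# above; every value is a made-up placeholder (the ISBN is not a valid one).
_example_entry = {
    'id': '1000000',
    'title': u'示例书名',
    'summary': u'内容简介',
    'publisher': u'示例出版社',
    'isbn13': '9787530200000',
    'pubdate': '2020-1',
    'author': [u'张三'],
    'tags': [{'name': u'小说'}],
    'rating': {'average': '8.6'},   # divided by 2.0 to fit calibre's 5-point scale
    'images': {'large': 'https://img1.doubanio.com/view/subject/l/public/s0.jpg'},
    'series': {'title': u'示例丛书'},
}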
Exemplo n.º 23
class noosfere(Source):
    # see https://manual.calibre-ebook.com/fr/plugins.html#calibre.ebooks.metadata.sources.base.Source
    # and https://manual.calibre-ebook.com/fr/_modules/calibre/ebooks/metadata/sources/base.html#Source

    name = 'noosfere DB'
    description = _(
        'Source extension: downloads and sets metadata from noosfere.org for selected volumes'
    )
    author = 'Louis Richard Pirlet'
    version = (0, 9, 0)
    minimum_calibre_version = (5, 11, 0)

    ID_NAME = 'noosfere'
    capabilities = frozenset(['identify', 'cover'])
    touched_fields = frozenset([
        'title', 'authors', 'identifier:isbn', 'identifier:nsfr_id',
        'languages', 'comments', 'publisher', 'pubdate', 'series', 'tags'
    ])
    has_html_comments = True
    supports_gzip_transfer_encoding = True

    # Since noosfere is written in French for French-speaking people, I
    # took the liberty to write the following information in French. I will
    # comment with a translation in English.

    # config help message: noosfere is a database that presents information
    # about French books tagged as science fiction. This information spans
    # from the authors to the films made from the books, including the
    # translators, illustrators, critics... and of course their works. Books
    # that were published several times are each exposed as a "volume". All
    # those volumes share the authors and the book content; they MAY share,
    # or not, the cover, the publisher, the publisher's collection and the
    # associated order number, the summary, the reviews, etc. The choice of
    # the volume is made by the program. One may somewhat influence that
    # choice through the `priorité de tri´ dialog box. On the other hand,
    # there is no official way to programmatically update a custom column,
    # so there is a tick box that pushes this information along with the
    # publisher. Please read the doc to understand how to put it back later
    # in the right place with the right format.

    config_help_message = '<p>' + _(
        " noosfere est une base de donnée qui propose des informations"
        " à propos des ouvrages, de genre science fiction, disponibles en langue française."
        " Ces informations vont de l'auteur aux films produits sur base de l'ouvrage en"
        " passant par les auteurs, les traducteurs, les illustrateurs, les critiquess..."
        " et bien sur, leurs oeuvres. Les livres qui ont été publiés plusieurs fois"
        " sont repris chacun sous un volume dont est exposé l'ISBN, la date de dépot legal"
        " (repris sous la date de publication, souvent méconnue), la couverture, l'éditeur,"
        " la collection de l'editeur et son numèro d'ordre. Le choix, programmé, du volume"
        " est quelque peu paramétrable par la boite de dialogue `priorité de tri´. "
        " D'autre part, il n'existe pas de moyens officiels de remplir une colonne définie"
        " par l'utilisateur. Pour rester dans les clous, je propose de remplir le champs"
        " de l'editeur avec, conjointement à celui-ci, la collection et son numero d'ordre."
        " Une petite procédure, décrite dans la doc devrait remettre tout en ordre."
    )

    # priority handling, a choice box that propose to set the priority over
    # the oldest published volume with a preference for an ISBN balanced for a maximum of comments
    # the latest published volume with a preference for an ISBN balanced for a maximum of comments
    # the oldest balanced for a maximum of comments
    # the latest balanced for a maximum of comments
    # the very oldest
    # the very latest
    # note that the selected volume will have the most represented editor
    # (if editor x reedited 4 time the book, and editor Y only once,
    # editor x will certainly be selected)
    # see algorithm explanation in worker.py 'ret_top_vol_indx(self, url, book_title)'

    PRIORITY_HANDLING = {
        '0_oldest_with_isbn': _("le plus ancien pondéré, préfère un isbn"),
        '1_latest_with_isbn': _("le plus récent pondéré, préfère un isbn"),
        '2_oldest': _("un plus ancien pondéré"),
        '3_latest': _("un plus recent pondéré"),
        '4_very_oldest': _("vraiment le plus ancien"),
        '5_very_latest': _("vraiment le plus recent")
    }

    options = (
        Option(
            'fat_publisher',
            'bool',
            False,
            _(
                "Ajoute collection et son numéro d'ordre au champ èditeur"
            ),  # add the editor's collection and the associated order number to the publisher field
            _("Cochez cette case pour ajouter la collection et son numéro d'ordre au champs de l'éditeur."
              "Voir LIS-MOI editeur_collection_seriel-code.txt"
              )  # check this box to enable... see README publisher_collection_seriel-code.txt
        ),
        Option(
            'debug_level',
            'number',
            0,
            _('Loquacité du journal, de 0 à 7'),  # verbosity of the log
            _('Le niveau de loquacité. O un minimum de rapport, 1 rapport etendu de __init__,'  # the level of verbosity. value 0 will output the minimum,
              ' 2 rapport étendu de worker, 4 rapport etendu des annexes... La somme 3, 5 ou 7'  # 1 debug messages of __init__, 2 debug messages of worker
              ' peut etre introduite. Ainsi 7 donne un maximun de rapport. Note: ce sont les 3'  # 4 debug level of accessory code... 3, 5 or 7 is the sum
              ' derniers bits de debug_level en notation binaire'
              )  # of the value defined above. In fact it is a bitwise flag
        ),  # spread over the last 3 bits of debug_level
        Option(
            'priority_handling',
            'choices',
            '0_oldest_with_isbn',
            _('priorité de tri:'),
            _("Priorité de tri du volume."
              ),  # how to push the priority over the choice of the volume
            choices=PRIORITY_HANDLING),
        Option(
            'requested_editor',
            'string',
            None,
            _("impose un éditeur"),  # impose a publisher
            _("le volume sera choisi chez l'éditeur le plus representé... SAUF:"  # the volume is picked-up from the most prevalent publisher
              " Remplir ce champ pour forcer un éditeur defini... DOIT"  # EXCEPTED: fill this field to force the publisher wanted
              " ETRE UN MATCH PARFAIT sinon le volume sera choisi sans tenir compte"  # MUST BE A PERFECT MATCH else the volume will ne picked-up
              " de l'éditeur.")  # without consideration to the publisher
        ),
    )

    # properties exposing the plugin preferences; a test can override them by setting the corresponding attribute on the instance
    @property
    def priority_handling(self):
        x = getattr(self, 'prio_handling', None)
        if x is not None:
            return x
        prio_handling = self.prefs['priority_handling']
        if prio_handling not in self.PRIORITY_HANDLING:
            prio_handling = sorted(
                self.PRIORITY_HANDLING
            )[0]  # sort the keys and select the first one (that should be the default)
        return prio_handling

    @property
    def extended_publisher(self):
        x = getattr(self, 'ext_pub', None)
        if x is not None:
            return x
        ext_pub = self.prefs.get('fat_publisher', False)
        return ext_pub

    @property
    def dbg_lvl(self):
        x = getattr(self, 'dl', None)
        if x is not None:
            return x
        dl = self.prefs.get('debug_level', 0)
        return dl

    @property
    def must_be_editor(self):
        x = getattr(self, 'te', None)
        if x is not None:
            return x
        te = self.prefs.get('requested_editor', None)
        return te

    # copied from other working metadata source (thanks to David Forrester and the Kobo Books Metadata source)
    def get_cached_cover_url(self, identifiers):
        # this routine returns a url that was discovered earlier and put into the cache,
        # probably via cache_identifier_to_cover_url() in worker.py
        # as the ISBN is sometimes missing in noosfere
        # and as noosfere does not provide any proprietary id,
        # I use nsfr_id, a combination bk$<significant part of book_url>$vl$<significant part of vol_url>
        # (this matches the parsing done in identify()); it allows going directly to the book page
        # (which may be the volume page when the book has only one volume)
        #
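        # Illustration (identifier values are made up) of the two lookup paths below:
        #   identifiers = {'nsfr_id': 'bk$123$vl$456'} -> cached_identifier_to_cover_url('bk$123$vl$456')
        #   identifiers = {'isbn': '<some isbn>'}      -> cached_isbn_to_identifier(isbn) -> nsfr_id -> cover url
        #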
        url = None
        nsfr_id = identifiers.get('nsfr_id', None)
        if nsfr_id is None:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                nsfr_id = self.cached_isbn_to_identifier(isbn)
        if nsfr_id is not None:
            url = self.cached_identifier_to_cover_url(nsfr_id)
        return url

    def ret_author_index(self, log, br, authors):
        # Trouve les references de l'auteur dans la soupe de noosfere
        # retourne author_index, une liste de href d'auteurs (au plus 8, les plus probables d'abord)
        # L'idée est de renvoyer UNE seule reference... trouver l'auteur est primordial si l'isbn est indisponible
        #
        # Find the author references in the soup produced by noosfere; return author_index,
        # a list of author hrefs (at most 8, most probable first)
        # the idea is to find ONE single reference... finding the author is essential if the isbn is unavailable
        #
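        # Rough sketch (names, links and ratios below are invented, for illustration only):
        #   POST noosearch.asp with Mots=<author>, ModeMoteur="MOTS-CLEFS" -> a list of 'auteur.asp' links
        #   each candidate is scored with SM (SequenceMatcher) against the cleaned calibre author:
        #     all_author_index['/livres/auteur.asp?...'] = [0.95, 'BORDAGE Pierre']   # kept (ratio >= 0.6)
        #     a 0.55 candidate is simply dropped
        #   the candidates are then sorted on the ratio and at most the first 8 urls are returned
        #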
        debug = self.dbg_lvl & 1
        log.info("\nIn ret_author_index(soup)")
        if debug:
            log.info("authors    : ", authors)
        all_author_index = {}
        author_index = []

        # try to get a short list of authors using "MOTS-CLEFS" match

        for j in range(len(authors)):
            rkt = {
                "Mots": authors[j],
                "auteurs": "auteurs",
                "ModeMoteur": "MOTS-CLEFS",
                "ModeRecherche": "AND",
                "recherche": "1",
                "Envoyer": "Envoyer"
            }
            url = "https://www.noosfere.org/livres/noosearch.asp"
            soup = ret_soup(log, self.dbg_lvl, br, url, rkt=rkt)[0]
            tmp_ai = soup.select('a[href*="auteur.asp"]')
            if len(tmp_ai):
                for i in range(len(tmp_ai)):
                    url_author, author, perta = tmp_ai[i]["href"], tmp_ai[
                        i].text, tmp_ai[i].find_previous('tr').select_one(
                            'td').text
                    ratio = SM(
                        None,
                        ret_clean_text(log, self.dbg_lvl, author, swap=True),
                        authors[j]).ratio()
                    if debug:
                        log.info("pertinence : ", perta, end=" ; ")
                        log.info("SM.ratio : {:.3f}".format(ratio), end=" ; ")
                        log.info("url_author : ", url_author, end=" ; ")
                        log.info("authors[j] : ", authors[j], end=" ; ")
                        log.info("author : ",
                                 ret_clean_text(log, self.dbg_lvl, author))
                    if ratio >= .6:
                        all_author_index[url_author] = [ratio, author]

            if not len(
                    all_author_index
            ):  # the short list failed, go for the long list using a "LITTERAL" match
                if debug: log.info("exact match failed, trying fuzzy match")
                # return self.ret_author_index(self, log, br, authors, ModeMoteur="LITTERAL")
                # did not work: "ret_author_index() got multiple values for argument 'ModeMoteur'"
                # (self.ret_author_index is a bound method, so self must not be passed again)
                # the common part of this code could be factored out, but it is kept duplicated
                # for now to ease debugging (a later cleanup may merge the two searches)
                for j in range(len(authors)):
                    rkt = {
                        "Mots": authors[j],
                        "auteurs": "auteurs",
                        "ModeMoteur": "LITTERAL",
                        "ModeRecherche": "AND",
                        "recherche": "1",
                        "Envoyer": "Envoyer"
                    }
                    url = "https://www.noosfere.org/livres/noosearch.asp"
                    soup = ret_soup(log, self.dbg_lvl, br, url, rkt=rkt)[0]
                    tmp_ai = soup.select('a[href*="auteur.asp"]')
                    if len(tmp_ai):
                        for i in range(len(tmp_ai)):
                            url_author, author, perta = tmp_ai[i][
                                "href"], tmp_ai[i].text, tmp_ai[
                                    i].find_previous('tr').select_one(
                                        'td').text
                            ratio = SM(
                                None,
                                ret_clean_text(log,
                                               self.dbg_lvl,
                                               author,
                                               swap=True), authors[j]).ratio()
                            if debug:
                                log.info("pertinence : ", perta, end=" ; ")
                                log.info("SM.ratio : {:.3f}".format(ratio),
                                         end=" ; ")
                                log.info("url_author : ",
                                         url_author,
                                         end=" ; ")
                                log.info("authors[j] : ",
                                         authors[j],
                                         end=" ; ")
                                log.info(
                                    "author : ",
                                    ret_clean_text(log, self.dbg_lvl, author))
                            if ratio >= .6:
                                all_author_index[url_author] = [ratio, author]

        sorted_author_index = dict(
            sorted(all_author_index.items(),
                   key=lambda x: x[1][0],
                   reverse=True))

        if debug: log.info("sorted_author_index :\n", sorted_author_index)

        # From Python 3.6 onward the standard dict type maintains insertion order
        # (Python 3.7 elevates this implementation detail to a language specification);
        # noosfere returns the highest pertinence first (the most probable author comes out first),
        # so there is no need to sort on the pertinence field (it would be different for calibre below version 5)
        #
        # we only keep the most probable candidates: they are sorted on the SM ratio and at most
        # the first 8 are returned
        #

        count = 0
        for key, ref in sorted_author_index.items():
            count += 1
            url_author, ratio, name_author = key, ref[0], ref[1]
            author_index.append(url_author)
            if debug:
                log.info("ratio : ", ratio, end=" ; ")
                log.info("author     : ", name_author, end=" ; ")
                log.info("url_author : ", url_author, end=" ; ")
                log.info("count : ", count)
#                log.info("author_index : ",author_index)       # may be long
            if count == 8: break

        if debug: log.info('return from ret_author_index')
        return author_index

    def ret_book_per_author_index(self, log, br, author_index, title,
                                  book_index):
        # Find the book references of a known author in the soup returned by noosfere
        # fills and returns book_index, a dict with key=book_url and val=book_title
        # The idea is to send back a few references that hopefully contain the expected title
        #
        # Trouver la reference des livres d'un auteur connu dans la soupe produite par noosfere
        # remplit et retourne book_index, un dictionnaire avec key=book_url, val=book_title
        # L'idée est de renvoyer une série de references dont on extrait les livres proches du titre de calibre
        #
        # now that we have a list of authors, let's get all the books associated with them
        # the book_index dictionary will contain all the retained book references...
        # if two authors share a book url, the later author simply overwrites it, keeping the list of books unique
        #
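        # Shape of the result (url and title below are placeholders):
        #   book_index['/livres/EditionsLivre.asp?numitem=...'] = '<book title, lower case>'
        # only candidates whose SM ratio against the calibre title exceeds 0.6 are kept,
        # and a ratio of exactly 1 short-circuits both the book loop and the author loop
        #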
        debug = self.dbg_lvl & 1
        log.info(
            "\nIn ret_book_per_author_index(self, log, br, author_index, title, book_index)"
        )
        if debug:
            log.info("author_index : ", author_index)
            log.info("title        : ", title)
            log.info("book_index   : ", book_index)

        book_per_author_index = {}
        unsorted_book_index = {}

        for i in range(len(author_index)):
            rqt = author_index[i] + "&Niveau=livres"
            url = "https://www.noosfere.org" + rqt
            soup = ret_soup(log, self.dbg_lvl, br, url)[0]
            tmp_bpai = soup.select('a[href*="ditionsLivre.asp"]')
            ratio = 0  # keep ratio defined even when this author has no book link
            for j in range(len(tmp_bpai)):  # j: avoid shadowing the outer loop index i
                book_title = tmp_bpai[j].text.lower()
                book_url = (tmp_bpai[j]["href"].replace(
                    './', '/livres/').split('&'))[0]
                ratio = SM(None, title,
                           ret_clean_text(log, self.dbg_lvl,
                                          book_title)).ratio()
                if debug:
                    log.info("SM.ratio : {:.3f}".format(ratio), end=" ; ")
                    log.info("book_url : ", book_url, end=" ; ")
                    log.info('tmp_bpai[j]["href"] : ',
                             tmp_bpai[j]["href"],
                             end=" ; ")
                    log.info("book_title : ", book_title)
                if ratio > .6:
                    unsorted_book_index[ratio] = [book_url, "", book_title]
                if ratio == 1:
                    unsorted_book_index = {}
                    unsorted_book_index[ratio] = [book_url, "", book_title]
                    break  # we have a perfect match, no need to go further through this author's books
                    # (this could cause a problem if several authors produced an identical title)

            sorted_book_index = dict(
                sorted(unsorted_book_index.items(), reverse=True))
            if debug: log.info("sorted bySM.ratio")
            for key, ref in sorted_book_index.items():
                if debug:
                    log.info("SM.ratio : {:.3f}".format(key), end=" ; ")
                    log.info("book_url : ", ref[0], end=" ; ")
                    log.info("book_title : ", ref[2])
                book_index[ref[0]] = ref[2]
            log.info('book_index[book_url] = book_title : ', book_index)

            if ratio == 1:
                log.info(
                    "Perfect match, we got it and we can stop looking further")
                break  # we have a perfect match no need to examine other authors

        if debug:
            log.info('return book_index from ret_book_per_author_index\n')
        return book_index

    def ISBN_ret_book_index(self, log, br, isbn, book_index):
        # Trouver la reference d'un livre (par ISBN) dans la soupe produite par noosfere
        # retourne book_index{}, un dictionnaire avec key=book_url, val=title
        # L'idée est de trouver UNE seule reference...
        # Attention: on retourne une reference qui peut contenir PLUSIEURS volumes,
        # c'est-à-dire différents éditeurs, différentes rééditions et/ou même un titre différent... YESss)
        #
        # Find the book reference (by ISBN) in the soup returned by noosfere
        # returns book_index{}, a dictionary with key=book_url, val=title
        # The idea is to find ONE unique reference...
        # Caution: the reference may contain several volumes,
        # each with potentially a different publisher, a different edition date... and even a different title
        #
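        # Example of the flow (the ISBN value is whatever calibre passed in): posting Mots=<isbn>
        # with ModeMoteur="MOTS-CLEFS" to noosearch.asp returns zero or more 'EditionsLivre.asp'
        # links; each link may still group several volumes, which worker.py sorts out later
        #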
        debug = self.dbg_lvl & 1
        log.info("\nIn ISBN_ret_book_index(self, log, br, isbn, book_index)")

        # if isbn valid then we want to select exact match (correspondance exacte = MOTS-CLEFS)
        rkt = {
            "Mots": isbn,
            "livres": "livres",
            "ModeMoteur": "MOTS-CLEFS",
            "ModeRecherche": "AND",
            "recherche": "1",
            "Envoyer": "Envoyer"
        }
        url = "https://www.noosfere.org/livres/noosearch.asp"
        soup = ret_soup(log, self.dbg_lvl, br, url, rkt=rkt)[0]
        tmp_rbi = soup.select('a[href*="ditionsLivre.asp"]')
        if len(tmp_rbi):
            for i in range(len(tmp_rbi)):
                if debug:
                    log.info(
                        "tmp_rbi[" + str(i) + "].text, tmp_rbi[" + str(i) +
                        "]['href'] : ", tmp_rbi[i].text, tmp_rbi[i]["href"])
                book_index[tmp_rbi[i]["href"]] = tmp_rbi[i].text

        if debug:
            log.info("book_index : ", book_index)
            log.info("return book_index from ISBN_ret_book_index\n")
        return book_index

    def identify(self,
                 log,
                 result_queue,
                 abort,
                 title=None,
                 authors=None,
                 identifiers={},
                 timeout=30):
        # this is the entry point...
        # Note: this method automatically retries without identifiers, i.e. the search is
        # resubmitted from inside it if no match is found with the identifiers.
        #

        log.info('self.dbg_lvl            : ', self.dbg_lvl)
        log.info('self.extended_publisher : ', self.extended_publisher)
        log.info('self.priority_handling  : ', self.priority_handling)
        log.info('self.must_be_editor     : ', self.must_be_editor)

        debug = self.dbg_lvl & 1
        log.info(
            '\nEntering identify(self, log, result_queue, abort, title=None, authors=None,identifiers={}, timeout=30)'
        )
        if debug:
            log.info('log          : ', log)
            log.info('result_queue : ', result_queue)
            log.info('abort        : ', abort)
            log.info('title        : ', title)
            log.info('authors      : ', authors, type(authors))
            log.info('identifiers  : ', identifiers, type(identifiers))
            log.info('\n')

        br = self.browser

        isbn = identifiers.get('isbn', None)
        if isbn: isbn = verify_isbn(log, self.dbg_lvl, isbn)
        log.info('ISBN value is : ', isbn)

        # the nsfr_id is designed to carry the significant part of the url:
        # that is, the number after the "=" in a url containing "niourf.asp?numlivre"
        # one can force access to a particular volume by setting nsfr_id to vl$<number>
        # could be an entry point if I can make sure that the noosfere source runs alone and in interactive mode...
        nsfr_id = identifiers.get('nsfr_id', None)
        log.info('nsfr_id value is : ', nsfr_id)

        log.info('"Clean" both the authors list and the title... ')
        if authors:
            for i in range(len(authors)):
                authors[i] = ret_clean_text(log, self.dbg_lvl, authors[i])
        if title:
            title = ret_clean_text(log, self.dbg_lvl, title)

        log.info('getting one or more book url')
        book_index = {}  # a dict {book_url: book_title}
        if nsfr_id:
            log.info('trying noosfere id, ', nsfr_id)
            nsfr = nsfr_id.split("$")
            if "bk" in nsfr[0]:
                url = "/livres/EditionsLivre.asp?numitem=" + nsfr[1]
                if "vl" in nsfr[2]:
                    url = "/livres/niourf.asp?numlivre=" + nsfr[3]
                book_index[url] = title
            elif "vl" in nsfr[0]:
                url = "/livres/niourf.asp?numlivre=" + nsfr[1]
                book_index[url] = title
            else:
                log.info('noosfere id not valid...')
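            # Illustration of the parsing above (the numbers are placeholders):
            #   nsfr_id = 'bk$123$vl$456' -> url = '/livres/niourf.asp?numlivre=456'
            #   nsfr_id = 'vl$456'        -> url = '/livres/niourf.asp?numlivre=456'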

        if not book_index:
            log.info('trying ISBN', isbn)
            if isbn:
                book_index = self.ISBN_ret_book_index(log, br, isbn,
                                                      book_index)
                if not len(book_index):
                    log.error("This ISBN was not found: ", isbn,
                              "trying with title", title, "and author",
                              authors)
                    return self.identify(log,
                                         result_queue,
                                         abort,
                                         title=title,
                                         authors=authors,
                                         timeout=timeout)
            elif title and authors:
                log.info('trying with authors and title')
                author_index = self.ret_author_index(log, br, authors)
                if len(author_index):
                    book_index = self.ret_book_per_author_index(
                        log, br, author_index, title, book_index)
                if not len(author_index):
                    log.info("Désolé, aucun auteur trouvé avec : ", authors)
                    return
        # here we could maybe try with the title alone... still to be designed (lrp todo)... perhaps for the case where the author is drowned in a mass of similar names

        if not book_index:
            log.error("No book found in noosfere... ")
            return

        if abort.is_set():
            log.info('abort was set... aborting... ')
            return

        tmp_list, i = [], 0
        for key, ref in book_index.items():
            book_url, book_title = key, ref
            if debug:
                log.info("sending to worker", i, "book_url, book_title : ",
                         book_url, ", ", book_title)
            i += 1
            tmp_list.append((book_url, book_title))

        log.info('\nCreating each worker... ')
        from calibre_plugins.noosfere.worker import Worker
        workers = [
            Worker(log, data[0], data[1], isbn, result_queue, br, i, self,
                   self.dbg_lvl) for i, data in enumerate(tmp_list)
        ]

        for w in workers:
            w.start()
            # Don't send all requests at the same time
            time.sleep(0.2)

        while not abort.is_set():
            a_worker_is_alive = False
            for w in workers:
                w.join(0.2)
                if abort.is_set():
                    log.info('abort was set while in loop... aborting... ')
                    break
                if w.is_alive():
                    a_worker_is_alive = True
            if not a_worker_is_alive:
                break

        if debug: log.info("return None from identify")
        return None

    def download_cover(self,
                       log,
                       result_queue,
                       abort,
                       title=None,
                       authors=None,
                       identifiers={},
                       timeout=30):
        # will download the cover from noosfere provided its url was found earlier (and cached)... If not,
        # it runs the metadata download first and tries to cache the cover url from its results...
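        # Flow sketch: look for a cached cover url first; on a miss, run identify() into a private
        # Queue, sort its results with identify_results_keygen() and keep the first result whose
        # identifiers resolve to a cached cover url, then fetch that url with the shared browser.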

        cached_url = self.get_cached_cover_url(identifiers)
        if cached_url is None:
            log.info('No cached cover found, running identify')
            rq = Queue()
            self.identify(log,
                          rq,
                          abort,
                          title=title,
                          authors=authors,
                          identifiers=identifiers)
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers))
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    break
        if cached_url is None:
            log.info('No cover found')
            return

        if abort.is_set():
            return

        br = self.browser
        log('Downloading cover from:', cached_url)
        try:
            cdata = br.open_novisit(cached_url, timeout=timeout).read()
            result_queue.put((self, cdata))
        except Exception:
            log.exception('Failed to download cover from:', cached_url)