Exemplo n.º 1
0
    def get_author_info(self, authorid=None, authorname=None, refresh=False):
        """Fetch one author's details from the GoodReads author/show API.

        Builds the ``author/show/<authorid>.xml`` URL from ``authorid`` and
        ``self.params``, fetches it with ``get_xml_request`` and copies selected
        child values of the returned ``<author>`` element into a dict.

        :param authorid: GoodReads author id as a string; required in practice
            because it is concatenated into the URL.
        :param authorname: stored unchanged under the ``authorname`` key.
        :param refresh: accepted but not used by this method.
        :return: dict with the keys ``authorid``, ``authorlink``, ``authorimg``,
            ``authorborn``, ``authordeath``, ``totalbooks`` and ``authorname``;
            an empty dict if no id was given, the request failed or no author
            element came back.
        """
        author_dict = {}
        if not authorid:
            # the default of None cannot be concatenated into the URL below
            logger.debug("No authorid supplied for author info request")
            return author_dict

        URL = 'http://www.goodreads.com/author/show/' + authorid + '.xml?' + urllib.urlencode(
            self.params)

        try:
            rootxml, in_cache = get_xml_request(URL)
        except Exception as e:
            logger.error("Error getting author info: %s" % e)
            return author_dict
        if rootxml is None:
            logger.debug("Error requesting author info")
            return author_dict

        resultxml = rootxml.find('author')

        # find() returns None when there is no <author> child, so check for
        # that before asking for its length
        if resultxml is None or not len(resultxml):
            logger.warn('No author found with ID: ' + authorid)
            return author_dict

        def child_text(tag):
            # text of the named child, or None when the child is absent
            node = resultxml.find(tag)
            return None if node is None else node.text

        logger.debug("[%s] Processing info for authorID: %s" %
                     (authorname, authorid))

        # PAB added authorname to author_dict - this holds the intact name preferred by GR
        author_dict = {
            'authorid': resultxml[0].text,  # value of the first child element
            'authorlink': child_text('link'),
            'authorimg': child_text('image_url'),
            'authorborn': child_text('born_at'),
            'authordeath': child_text('died_at'),
            'totalbooks': child_text('works_count'),
            'authorname': authorname
        }
        return author_dict
Exemplo n.º 2
0
    def get_author_info(self, authorid=None, authorname=None, refresh=False):
        """Return a dict of GoodReads details for the author with ``authorid``.

        The ``author/show/<authorid>.xml`` page is requested through
        ``get_xml_request`` and selected children of its ``<author>`` element
        are copied into the result.

        :param authorid: GoodReads author id as a string; it is concatenated
            into the request URL, so a value is required in practice.
        :param authorname: stored unchanged under the ``authorname`` key.
        :param refresh: accepted but not used by this method.
        :return: dict with the keys ``authorid``, ``authorlink``, ``authorimg``,
            ``authorborn``, ``authordeath``, ``totalbooks`` and ``authorname``;
            an empty dict if no id was given, the request failed or no author
            element came back.
        """
        author_dict = {}
        if not authorid:
            # None (the default) would raise TypeError in the concatenation below
            logger.debug("No authorid supplied for author info request")
            return author_dict

        URL = 'http://www.goodreads.com/author/show/' + authorid + '.xml?' + urllib.urlencode(self.params)

        try:
            rootxml, in_cache = get_xml_request(URL)
        except Exception as e:
            logger.error("Error getting author info: %s" % e)
            return author_dict
        if rootxml is None:
            logger.debug("Error requesting author info")
            return author_dict

        resultxml = rootxml.find('author')

        # find() yields None if the reply has no <author> child; len(None)
        # would raise, so test for None first
        if resultxml is None or not len(resultxml):
            logger.warn('No author found with ID: ' + authorid)
            return author_dict

        def child_text(tag):
            # text of the named child, or None when the child is absent
            node = resultxml.find(tag)
            return None if node is None else node.text

        logger.debug("[%s] Processing info for authorID: %s" % (authorname, authorid))

        # PAB added authorname to author_dict - this holds the intact name preferred by GR
        author_dict = {
            'authorid': resultxml[0].text,  # value of the first child element
            'authorlink': child_text('link'),
            'authorimg': child_text('image_url'),
            'authorborn': child_text('born_at'),
            'authordeath': child_text('died_at'),
            'totalbooks': child_text('works_count'),
            'authorname': authorname
        }
        return author_dict
Exemplo n.º 3
0
    def find_author_id(self, refresh=False):
        """Look up ``self.name`` on GoodReads and return that author's details.

        Queries the ``api/author_url`` endpoint for the name, then passes the id
        and name of each returned ``<author>`` element to
        ``self.get_author_info``.

        :param refresh: forwarded to ``get_author_info``.
        :return: whatever ``get_author_info`` returned for the last author
            element processed, or an empty list if the request failed or no
            usable author element came back.
        """
        author = self.name
        # Goodreads doesn't like initials followed by spaces,
        # eg "M L Hamilton", needs "M. L. Hamilton" or "M.L.Hamilton"
        # but DOES need spaces if not initials eg "Tom.Holt" fails, but "Tom Holt" works
        # (length check first: a name shorter than two characters has no author[1])
        if len(author) > 1 and author[1] == ' ':
            author = author.replace(' ', '.')
            author = author.replace('..', '.')
        URL = 'http://www.goodreads.com/api/author_url/' + urllib.quote(
            author) + '?' + urllib.urlencode(self.params)

        # googlebooks gives us author names with long form unicode characters
        # NOTE: the URL above is already built, so from here on the normalized
        # name is only used in log messages
        if isinstance(author, str):
            author = author.decode('utf-8')  # make unicode
        author = unicodedata.normalize('NFC',
                                       author)  # normalize to short form

        logger.debug("Searching for author with name: %s" % author)

        authorlist = []
        try:
            rootxml, in_cache = get_xml_request(URL)
        except Exception as e:
            logger.error("Error finding authorid: %s, %s" % (e, URL))
            return authorlist
        if rootxml is None:
            logger.debug("Error requesting authorid")
            return authorlist

        resultxml = rootxml.getiterator('author')

        if not len(resultxml):
            logger.warn('No authors found with name: %s' % author)
        else:
            # In spite of how this looks, goodreads only returns one result, even if there are multiple matches
            # we just have to hope we get the right one. eg search for "James Lovelock" returns "James E. Lovelock"
            # who only has one book listed under googlebooks, the rest are under "James Lovelock"
            # goodreads has all his books under "James E. Lovelock". Can't come up with a good solution yet.
            # For now we'll have to let the user handle this by selecting/adding the author manually
            for match in resultxml:
                authorid = match.attrib.get("id")
                if not authorid:
                    # get_author_info needs an id to build its request URL
                    continue
                authorname = match[0].text  # text of the element's first child
                authorlist = self.get_author_info(authorid, authorname,
                                                  refresh)
        return authorlist
Exemplo n.º 4
0
    def find_author_id(self, refresh=False):
        """Resolve ``self.name`` to a GoodReads author and return its details.

        The name is sent to the ``api/author_url`` endpoint; the id and name of
        each ``<author>`` element in the reply are handed to
        ``self.get_author_info``.

        :param refresh: forwarded to ``get_author_info``.
        :return: whatever ``get_author_info`` returned for the last author
            element processed, or an empty list if the request failed or no
            usable author element came back.
        """
        author = self.name
        # Goodreads doesn't like initials followed by spaces,
        # eg "M L Hamilton", needs "M. L. Hamilton" or "M.L.Hamilton"
        # but DOES need spaces if not initials eg "Tom.Holt" fails, but "Tom Holt" works
        # (guard the index: names shorter than two characters have no author[1])
        if len(author) > 1 and author[1] == ' ':
            author = author.replace(' ', '.')
            author = author.replace('..', '.')
        URL = 'http://www.goodreads.com/api/author_url/' + urllib.quote(author) + '?' + urllib.urlencode(self.params)

        # googlebooks gives us author names with long form unicode characters
        # NOTE: the URL is already built at this point, so the normalized name
        # is only used in the log messages below
        if isinstance(author, str):
            author = author.decode('utf-8')  # make unicode
        author = unicodedata.normalize('NFC', author)  # normalize to short form

        logger.debug("Searching for author with name: %s" % author)

        authorlist = []
        try:
            rootxml, in_cache = get_xml_request(URL)
        except Exception as e:
            logger.error("Error finding authorid: %s, %s" % (e, URL))
            return authorlist
        if rootxml is None:
            logger.debug("Error requesting authorid")
            return authorlist

        resultxml = rootxml.getiterator('author')

        if not len(resultxml):
            logger.warn('No authors found with name: %s' % author)
        else:
            # In spite of how this looks, goodreads only returns one result, even if there are multiple matches
            # we just have to hope we get the right one. eg search for "James Lovelock" returns "James E. Lovelock"
            # who only has one book listed under googlebooks, the rest are under "James Lovelock"
            # goodreads has all his books under "James E. Lovelock". Can't come up with a good solution yet.
            # For now we'll have to let the user handle this by selecting/adding the author manually
            for match in resultxml:
                authorid = match.attrib.get("id")
                if not authorid:
                    # without an id get_author_info cannot build its URL
                    continue
                authorname = match[0].text  # text of the element's first child
                authorlist = self.get_author_info(authorid, authorname, refresh)
        return authorlist
Exemplo n.º 5
0
    def get_author_info(self, authorid=None):
        """Fetch one author's details from the GoodReads author/show API.

        Requests ``author/show/<authorid>.xml`` through ``get_xml_request`` and
        builds a dict from the returned ``<author>`` element. The author name is
        taken from the element's second child; a name of the form
        "<postfix>, <rest>" is reordered to "<rest> <postfix>" when the part
        before the comma is listed in the NAME_POSTFIX setting.

        :param authorid: GoodReads author id as a string; it is concatenated
            into the request URL, so a value is required in practice.
        :return: dict with the keys ``authorid``, ``authorlink``, ``authorimg``,
            ``authorborn``, ``authordeath``, ``totalbooks`` and ``authorname``;
            an empty dict if the request failed or no author element came back.
        """
        URL = 'http://www.goodreads.com/author/show/' + authorid + '.xml?' + urllib.urlencode(self.params)
        author_dict = {}

        try:
            rootxml, in_cache = get_xml_request(URL)
        except Exception as e:
            logger.error("Error getting author info: %s" % str(e))
            return author_dict
        if rootxml is None:
            logger.debug("Error requesting author info")
            return author_dict

        resultxml = rootxml.find('author')

        # find() returns None when the reply has no <author> child
        if resultxml is None or not len(resultxml):
            logger.warn('No author found with ID: ' + authorid)
            return author_dict

        # added authorname to author_dict - this holds the intact name preferred by GR
        # except GR messes up names like "L. E. Modesitt, Jr." where it returns <name>Jr., L. E. Modesitt</name>
        authorname = resultxml[1].text or ''  # an empty element has text None
        if "," in authorname:
            postfix = getList(lazylibrarian.CONFIG['NAME_POSTFIX'])
            words = authorname.split(',')
            if len(words) == 2:
                # lower must be *called*: the bound method itself is never in
                # the list, so the swap could never happen
                # (assumes the NAME_POSTFIX entries are lowercase - TODO confirm)
                if words[0].strip().strip('.').lower() in postfix:
                    authorname = words[1].strip() + ' ' + words[0].strip()

        logger.debug("[%s] Processing info for authorID: %s" % (authorname, authorid))
        author_dict = {
            'authorid': resultxml[0].text,
            'authorlink': resultxml.find('link').text,
            'authorimg': resultxml.find('image_url').text,
            'authorborn': resultxml.find('born_at').text,
            'authordeath': resultxml.find('died_at').text,
            'totalbooks': resultxml.find('works_count').text,
            'authorname': ' '.join(authorname.split())  # remove any extra whitespace
        }
        return author_dict
Exemplo n.º 6
0
    def find_author_id(self, refresh=False):
        """Resolve ``self.name`` to a GoodReads author and return its details.

        The name is passed through ``formatAuthorName`` and sent to the
        ``api/author_url`` endpoint; the id of each ``<author>`` element in the
        reply is handed to ``self.get_author_info``.

        :param refresh: when true the request is made with ``useCache=False``.
        :return: whatever ``get_author_info`` returned for the last author
            element, or an empty list if the request failed or nothing matched.
        """
        searchname = formatAuthorName(self.name)
        quoted_name = urllib.quote(searchname)
        query = urllib.urlencode(self.params)
        URL = 'http://www.goodreads.com/api/author_url/' + quoted_name + '?' + query

        # googlebooks hands us names in long-form unicode; convert byte strings
        # to unicode and normalize to the short form
        if isinstance(searchname, str):
            searchname = searchname.decode('utf-8')
        searchname = unicodedata.normalize('NFC', searchname)

        logger.debug("Searching for author with name: %s" % searchname)

        found = []
        try:
            rootxml, in_cache = get_xml_request(URL, useCache=not refresh)
        except Exception as e:
            logger.error("Error finding authorid: %s, %s" % (URL, str(e)))
            return found

        if rootxml is None:
            logger.debug("Error requesting authorid")
            return found

        matches = rootxml.getiterator('author')
        if len(matches) == 0:
            logger.warn('No authors found with name: %s' % searchname)
            return found

        # Goodreads only ever returns one result here, even when several
        # authors match, so we have to hope it is the right one. Searching for
        # "James Lovelock" returns "James E. Lovelock": googlebooks lists one
        # book under that name and the rest under "James Lovelock", while
        # goodreads files them all under "James E. Lovelock". No good solution
        # yet; the user has to select/add the author manually.
        for match in matches:
            found = self.get_author_info(match.attrib.get("id"))
        return found
Exemplo n.º 7
0
    def find_book(self, bookid=None, queue=None):
        """Fetch one book from GoodReads by id and store it in the books table.

        The book is upserted with status "Wanted". Afterwards the cover, series
        and work-page columns are updated when getBookCover / cache_cover,
        getWorkSeries and getWorkPage return a value.

        :param bookid: GoodReads book id as a string; used in the request URL
            and as the BookID key, so a value is required in practice.
        :param queue: not used by this method.
        :return: None. Returns early, without writing anything, if the request
            fails or no author id can be found for the book's author.
        """
        myDB = database.DBConnection()

        URL = 'https://www.goodreads.com/book/show/' + bookid + '?' + urllib.urlencode(self.params)

        try:
            rootxml, in_cache = get_xml_request(URL)
        except Exception as e:
            logger.error("Error finding book: %s" % e)
            return
        if rootxml is None:
            logger.debug("Error requesting book")
            return

        def book_text(tag):
            # text of ./book/<tag>, or None when the element is missing or empty
            node = rootxml.find('./book/' + tag)
            return None if node is None else node.text

        bookLanguage = book_text('language_code')
        bookname = book_text('title')

        if not bookLanguage:
            bookLanguage = "Unknown"

        # PAB user has said they want this book, don't block for bad language, just warn
        valid_langs = [valid_lang.strip() for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')]
        if bookLanguage not in valid_langs:
            logger.debug('Book %s language does not match preference' % bookname)

        bookdate = book_text('publication_year')
        if bookdate is None:
            bookdate = "0000"

        # a missing or empty img_url gets the placeholder too
        bookimg = book_text('img_url')
        if not bookimg or 'assets/nocover' in bookimg:
            bookimg = 'images/nocover.png'

        authorname = book_text('authors/author/name')
        bookdesc = book_text('description')
        bookisbn = book_text('isbn')
        bookpub = book_text('publisher')
        booklink = book_text('link')
        try:
            bookrate = float(book_text('average_rating'))
        except (TypeError, ValueError):
            # no rating, or one that is not a number
            bookrate = 0.0
        # path fixed: the original '.book/num_pages' matched nothing
        bookpages = book_text('num_pages')

        GR = GoodReads(authorname)
        author = GR.find_author_id()
        if not author:
            # without an author id the row below cannot be built
            logger.warn("No AuthorID for %s, unable to add book %s" % (authorname, bookname))
            return
        AuthorID = author['authorid']

        booksub = ''
        bookname = unaccented(bookname)
        if ': ' in bookname:
            # "Title: Subtitle" is split at the first ': '
            parts = bookname.split(': ', 1)
            bookname = parts[0]
            booksub = parts[1]

        dic = {':': '', '"': '', '\'': ''}
        bookname = replace_all(bookname, dic)
        bookname = bookname.strip()  # strip whitespace
        booksub = replace_all(booksub, dic)
        booksub = booksub.strip()  # strip whitespace
        # series info is taken from the subtitle when there is one, else the title
        if booksub:
            series, seriesNum = bookSeries(booksub)
        else:
            series, seriesNum = bookSeries(bookname)

        controlValueDict = {"BookID": bookid}
        newValueDict = {
            "AuthorName": authorname,
            "AuthorID": AuthorID,
            "AuthorLink": None,
            "BookName": bookname,
            "BookSub": booksub,
            "BookDesc": bookdesc,
            "BookIsbn": bookisbn,
            "BookPub": bookpub,
            "BookGenre": None,
            "BookImg": bookimg,
            "BookLink": booklink,
            "BookRate": bookrate,
            "BookPages": bookpages,
            "BookDate": bookdate,
            "BookLang": bookLanguage,
            "Status": "Wanted",
            "BookAdded": today(),
            "Series": series,
            "SeriesNum": seriesNum
        }

        myDB.upsert("books", newValueDict, controlValueDict)
        logger.debug("%s added to the books database" % bookname)

        if 'nocover' in bookimg or 'nophoto' in bookimg:
            # try to get a cover from librarything
            workcover = getBookCover(bookid)
            if workcover:
                logger.debug(u'Updated cover for %s to %s' % (bookname, workcover))
                controlValueDict = {"BookID": bookid}
                newValueDict = {"BookImg": workcover}
                myDB.upsert("books", newValueDict, controlValueDict)

        elif bookimg.startswith('http'):
            link = cache_cover(bookid, bookimg)
            if link is not None:
                controlValueDict = {"BookID": bookid}
                newValueDict = {"BookImg": link}
                myDB.upsert("books", newValueDict, controlValueDict)

        if seriesNum is None:
            #  try to get series info from librarything
            series, seriesNum = getWorkSeries(bookid)
            if seriesNum:
                logger.debug(u'Updated series: %s [%s]' % (series, seriesNum))
                controlValueDict = {"BookID": bookid}
                newValueDict = {
                    "Series": series,
                    "SeriesNum": seriesNum
                }
                myDB.upsert("books", newValueDict, controlValueDict)

        worklink = getWorkPage(bookid)
        if worklink:
            controlValueDict = {"BookID": bookid}
            newValueDict = {"WorkPage": worklink}
            myDB.upsert("books", newValueDict, controlValueDict)
Exemplo n.º 8
0
    def find_results(self, authorname=None, queue=None):
        """Search GoodReads for a keyword and put the matches on ``queue``.

        Despite its name, ``authorname`` is a general search term: each result
        is scored against the author name and the book title, and the term is
        also checked for looking like an ISBN.

        :param authorname: search term; required in practice.
        :param queue: object with a ``put`` method that receives the list of
            result dicts. The list is also delivered (empty) when the request
            fails, so a caller blocked on the queue is always released.
        :return: None; results are delivered through ``queue`` only.
        """
        resultlist = []
        resultcount = 0  # set up front so the summary log below can always run
        api_hits = 0

        def deliver():
            # hand the (possibly empty) list to whoever is waiting on the queue
            if queue is not None:
                queue.put(resultlist)

        # Goodreads doesn't like initials followed by spaces,
        # eg "M L Hamilton", needs "M. L. Hamilton" or "M.L.Hamilton"
        # but DOES need spaces if not initials eg "Tom.Holt" fails, but "Tom Holt" works
        # (length check first: a one-character term has no authorname[1])
        if len(authorname) > 1 and authorname[1] == ' ':
            authorname = authorname.replace(' ', '.')
            authorname = authorname.replace('..', '.')

        url = urllib.quote_plus(authorname.encode(lazylibrarian.SYS_ENCODING))
        set_url = 'http://www.goodreads.com/search.xml?q=' + url + '&' + urllib.urlencode(self.params)
        logger.debug('Now searching GoodReads API with keyword: ' + authorname)
        logger.debug('Searching for %s at: %s' % (authorname, set_url))

        # The term counts as an ISBN when everything except its last character
        # is 9 or 12 digits. Checked on the string itself, because converting
        # with int() drops leading zeros and so shortens ISBNs that start with 0.
        # It does not depend on the individual result, so it is computed once.
        isbn_body = authorname[:-1]
        if isbn_body.isdigit() and len(isbn_body) in (9, 12):
            isbn_fuzz = 100
        else:
            isbn_fuzz = 0

        try:
            try:
                rootxml, in_cache = get_xml_request(set_url)
            except Exception as e:
                logger.error("Error finding results: %s" % e)
                deliver()
                return
            if rootxml is None or not len(rootxml):
                logger.debug("Error requesting results")
                deliver()
                return
            if not in_cache:
                api_hits = api_hits + 1

            for work in rootxml.getiterator('work'):
                bookdate = work.find('original_publication_year').text
                if bookdate is None:
                    bookdate = "0000"

                authorNameResult = work.find('./best_book/author/name').text

                try:
                    bookimg = work.find('./best_book/image_url').text
                except (KeyError, AttributeError):
                    bookimg = None
                if not bookimg or bookimg == 'http://www.goodreads.com/assets/nocover/111x148.png':
                    bookimg = 'images/nocover.png'

                try:
                    bookrate = float(work.find('average_rating').text)
                except (AttributeError, TypeError, ValueError):
                    # element missing, empty or not a number
                    bookrate = 0.0

                bookid = work.find('./best_book/id').text
                booklink = 'http://www.goodreads.com/book/show/' + bookid

                bookTitle = work.find('./best_book/title').text
                if bookTitle is None:
                    bookTitle = ""

                author_fuzz = fuzz.token_set_ratio(authorNameResult, authorname)
                book_fuzz = fuzz.token_set_ratio(bookTitle, authorname)
                highest_fuzz = max(author_fuzz, book_fuzz, isbn_fuzz)

                resultlist.append({
                    'authorname': authorNameResult,
                    'bookid': bookid,
                    'authorid': work.find('./best_book/author/id').text,
                    'bookname': bookTitle.encode("ascii", "ignore"),
                    'booksub': None,
                    'bookisbn': '',
                    'bookpub': '',
                    'bookdate': bookdate,
                    'booklang': "Unknown",
                    'booklink': booklink,
                    'bookrate': bookrate,
                    'bookimg': bookimg,
                    'bookpages': '0',
                    'bookgenre': '',
                    'bookdesc': '',
                    'author_fuzz': author_fuzz,
                    'book_fuzz': book_fuzz,
                    'isbn_fuzz': isbn_fuzz,
                    'highest_fuzz': highest_fuzz,
                    # NOTE(review): this has always been filled with the average
                    # rating, not a review count - confirm what consumers expect
                    'num_reviews': bookrate
                })

                resultcount = resultcount + 1

        except urllib2.HTTPError as err:
            # elif chain so that exactly one message is logged per error
            if err.code == 404:
                logger.error('Received a 404 error when searching for author')
            elif err.code == 403:
                logger.warn('Access to api is denied: usage exceeded')
            else:
                logger.error('An unexpected error has occurred when searching for an author')

        logger.debug('Found %s result%s with keyword: %s' % (resultcount, plural(resultcount), authorname))
        logger.debug('The GoodReads API was hit %s time%s for keyword %s' % (api_hits, plural(api_hits), authorname))

        deliver()
Exemplo n.º 9
0
    def get_author_books(self, authorid=None, authorname=None, refresh=False):

        api_hits = 0
        gr_lang_hits = 0
        lt_lang_hits = 0
        gb_lang_change = 0
        cache_hits = 0
        not_cached = 0
        URL = 'http://www.goodreads.com/author/list/' + authorid + '.xml?' + urllib.urlencode(self.params)

        # Artist is loading
        myDB = database.DBConnection()
        controlValueDict = {"AuthorID": authorid}
        newValueDict = {"Status": "Loading"}
        myDB.upsert("authors", newValueDict, controlValueDict)
        books_dict = []
        try:
            rootxml, in_cache = get_xml_request(URL, useCache=not refresh)
        except Exception as e:
            logger.error("Error fetching author books: %s" % e)
            return books_dict
        if rootxml is None:
            logger.debug("Error requesting author books")
            return books_dict
        if not in_cache:
            api_hits = api_hits + 1
        resultxml = rootxml.getiterator('book')

        valid_langs = ([valid_lang.strip() for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')])

        if not len(resultxml):
            logger.warn('[%s] No books found for author with ID: %s' % (authorname, authorid))
        else:
            logger.debug("[%s] Now processing books with GoodReads API" % authorname)

            resultsCount = 0
            removedResults = 0
            duplicates = 0
            ignored = 0
            added_count = 0
            updated_count = 0
            book_ignore_count = 0
            total_count = 0
            logger.debug(u"url " + URL)

            authorNameResult = rootxml.find('./author/name').text
            logger.debug(u"author name " + authorNameResult)
            loopCount = 1

            while resultxml is not None:
                for book in resultxml:
                    total_count = total_count + 1

                    if (book.find('publication_year').text is None):
                        pubyear = "0000"
                    else:
                        pubyear = book.find('publication_year').text

                    try:
                        bookimg = book.find('image_url').text
                        if ('nocover' in bookimg):
                            bookimg = 'images/nocover.png'
                    except (KeyError,AttributeError):
                        bookimg = 'images/nocover.png'

    # PAB this next section tries to get the book language using the isbn13 to look it up. If no isbn13 we skip the
    # book entirely, rather than including it with an "Unknown" language. Changed this so we can still include the book
    # with language set to "Unknown". There is a setting in config.ini to allow or skip books with "Unknown" language
    # if you really don't want to include them.
    # Not all GR books have isbn13 filled in, but all have a GR bookid, which we've already got, so use that.
    # Also, with GR API rules we can only call the API once per second, which slows us down a lot when all we want
    # is to get the language. We sleep for one second per book that GR knows about for each author you have in your
    # library. The libraryThing API has the same 1 second restriction, and is limited to 1000 hits per day, but has
    # fewer books with unknown language. To get around this and speed up the process, see if we already have a book
    # in the database with a similar start to the ISBN. The way ISBNs work, digits 3-5 of a 13 char ISBN or digits 0-2
    # of a 10 digit ISBN indicate the region/language so if two books have the same 3 digit isbn code, they _should_
    # be the same language.
    # I ran a simple python script on my library of 1500 books, and these codes were 100% correct on matching book
    # languages, no mis-matches. It did result in a small number of books with "unknown" language being wrongly matched
    # but most "unknown" were matched to the correct language.
    # We could look up ISBNs we already know about in the database, but this only holds books in the languages we want
    # to keep, which reduces the number of cache hits, so we create a new database table, holding ALL results including
    # the ISBNs for languages we don't want and books we reject.
    # The new table is created (if not exists) in init.py so by the time we get here there is an existing table.
    # If we haven't an already matching partial ISBN, look up language code from libraryThing
    # "http://www.librarything.com/api/thingLang.php?isbn=1234567890"
    # If you find a matching language, add it to the database.  If "unknown" or "invalid", try GR as maybe GR can
    # provide a match.
    # If both LT and GR return unknown, add isbn to db as "unknown". No point in repeatedly asking LT for a code
    # it's told you it doesn't know.
    # As an extra option, if language includes "All" in config.ini, we can skip this whole section and process
    # everything much faster by not querying for language at all.
    # It does mean we include a lot of unwanted foreign translations in the database, but it's _much_ faster.

                    bookLanguage = "Unknown"
                    find_field = "id"
                    isbn = ""
                    isbnhead = ""
                    if "All" not in valid_langs:  # do we care about language
                        if (book.find('isbn').text is not None):
                            find_field = "isbn"
                            isbn = book.find('isbn').text
                            isbnhead = isbn[0:3]
                        else:
                            if (book.find('isbn13').text is not None):
                                find_field = "isbn13"
                                isbn = book.find('isbn13').text
                                isbnhead = isbn[3:6]
                        if (find_field != 'id'):  # isbn or isbn13 found

                            match = myDB.action('SELECT lang FROM languages where isbn = "%s"' %
                                                (isbnhead)).fetchone()
                            if (match):
                                bookLanguage = match['lang']
                                cache_hits = cache_hits + 1
                                logger.debug("Found cached language [%s] for %s [%s]" %
                                             (bookLanguage, find_field, isbnhead))
                            else:
                                # no match in cache, try searching librarything for a language code using the isbn
                                # if no language found, librarything return value is "invalid" or "unknown"
                                # returns plain text, not xml
                                BOOK_URL = 'http://www.librarything.com/api/thingLang.php?isbn=' + isbn
                                try:
                                    librarything_wait()
                                    resp = urllib2.urlopen(BOOK_URL, timeout=30).read()
                                    lt_lang_hits = lt_lang_hits + 1
                                    logger.debug("LibraryThing reports language [%s] for %s" % (resp, isbnhead))

                                    if ('invalid' in resp or 'Unknown' in resp):
                                        find_field = "id"  # reset the field to force search on goodreads
                                    else:
                                        bookLanguage = resp  # found a language code
                                        myDB.action('insert into languages values ("%s", "%s")' %
                                                    (isbnhead, bookLanguage))
                                        logger.debug(u"LT language %s: %s" % (isbnhead, bookLanguage))
                                except Exception as e:
                                    logger.error("Error finding LT language result for [%s], %s" % (isbn, e))
                                    find_field = "id"  # reset the field to search on goodreads

                        if (find_field == 'id'):
                            # [or bookLanguage == "Unknown"] no earlier match, we'll have to search the goodreads api
                            try:
                                if (book.find(find_field).text is not None):
                                    BOOK_URL = 'http://www.goodreads.com/book/show?id=' + \
                                        book.find(find_field).text + '&' + urllib.urlencode(self.params)
                                    logger.debug(u"Book URL: " + BOOK_URL)

                                    try:
                                        time_now = int(time.time())
                                        if time_now <= lazylibrarian.LAST_GOODREADS:
                                            time.sleep(1)

                                        BOOK_rootxml, in_cache = get_xml_request(BOOK_URL)
                                        if BOOK_rootxml is None:
                                            logger.debug('Error requesting book language code')
                                            bookLanguage = ""
                                        else:
                                            if not in_cache:
                                                # only update last_goodreads if the result wasn't found in the cache
                                                lazylibrarian.LAST_GOODREADS = time_now
                                            bookLanguage = BOOK_rootxml.find('./book/language_code').text
                                    except Exception as e:
                                        logger.error("Error finding book results: %s" % e)
                                    if not in_cache:
                                        gr_lang_hits = gr_lang_hits + 1
                                    if not bookLanguage:
                                        bookLanguage = "Unknown"

                                    if (isbnhead != ""):
                                        # GR didn't give an isbn so we can't cache it, just use language for this book
                                        myDB.action('insert into languages values ("%s", "%s")' %
                                                    (isbnhead, bookLanguage))
                                        logger.debug("GoodReads reports language [%s] for %s" %
                                                     (bookLanguage, isbnhead))
                                    else:
                                        not_cached = not_cached + 1

                                    logger.debug(u"GR language: " + bookLanguage)
                                else:
                                    logger.debug("No %s provided for [%s]" % (find_field, book.find('title').text))
                                    # continue

                            except Exception as e:
                                logger.debug(u"An error has occured: %s" % e)

                        if bookLanguage not in valid_langs:
                            logger.debug('Skipped a book with language %s' % bookLanguage)
                            ignored = ignored + 1
                            continue
                    bookname = book.find('title').text
                    bookid = book.find('id').text
                    bookdesc = book.find('description').text
                    bookisbn = book.find('isbn').text
                    bookpub = book.find('publisher').text
                    booklink = book.find('link').text
                    bookrate = float(book.find('average_rating').text)
                    bookpages = book.find('num_pages').text
                    bookname = unaccented(bookname)
                    if ': ' in bookname:
                        parts = bookname.split(': ', 1)
                        bookname = parts[0]
                        booksub = parts[1]
                    else:
                        booksub = ''
                    dic = {':': '', '"': '', '\'': ''}
                    bookname = replace_all(bookname, dic)
                    bookname = bookname.strip()  # strip whitespace
                    booksub = replace_all(booksub, dic)
                    booksub = booksub.strip()  # strip whitespace
                    if booksub:
                        series,seriesNum = bookSeries(booksub)
                    else:
                        series,seriesNum = bookSeries(bookname)

                    # GoodReads sometimes has multiple bookids for the same book (same author/title, different editions)
                    # and sometimes uses the same bookid if the book is the same but the title is slightly different
                    # We use bookid, then reject if another author/title has a different bookid so we just keep one...
                    find_book_status = myDB.select('SELECT * FROM books WHERE BookID = "%s"' % bookid)
                    if find_book_status:
                        for resulted in find_book_status:
                            book_status = resulted['Status']
                            locked = resulted ['Manual']
                    else:
                        book_status = lazylibrarian.NEWBOOK_STATUS
                        locked = False

                    rejected = False

                    if re.match('[^\w-]', bookname):  # reject books with bad characters in title
                        logger.debug(u"removed result [" + bookname + "] for bad characters")
                        removedResults = removedResults + 1
                        rejected = True

                    if not rejected and not bookname:
                        logger.debug('Rejecting bookid %s for %s, no bookname' %
                                (bookid, authorNameResult))
                        removedResults = removedResults + 1
                        rejected = True

                    if not rejected:
                        find_books = myDB.select('SELECT * FROM books WHERE BookName = "%s" and AuthorName = "%s"' %
                                                    (bookname, authorNameResult))
                        if find_books:
                            for find_book in find_books:
                                if find_book['BookID'] != bookid:
                                    # we have a book with this author/title already
                                    logger.debug('Rejecting bookid %s for [%s][%s] already got %s' %
                                        (find_book['BookID'], authorNameResult, bookname, bookid))
                                    duplicates = duplicates + 1
                                    rejected = True
                                    break

                    if not rejected:
                        find_books = myDB.select('SELECT * FROM books WHERE BookID = "%s"' % bookid)
                        if find_books:
                            # we have a book with this bookid already
                            logger.debug('Rejecting bookid %s for [%s][%s] already got this bookid in database' %
                                (bookid, authorNameResult, bookname))
                            duplicates = duplicates + 1
                            rejected = True
                            break

                    if not rejected:
                        if book_status != "Ignored":
                            if not locked:
                                controlValueDict = {"BookID": bookid}
                                newValueDict = {
                                    "AuthorName": authorNameResult,
                                    "AuthorID": authorid,
                                    "AuthorLink": None,
                                    "BookName": bookname,
                                    "BookSub": booksub,
                                    "BookDesc": bookdesc,
                                    "BookIsbn": bookisbn,
                                    "BookPub": bookpub,
                                    "BookGenre": None,
                                    "BookImg": bookimg,
                                    "BookLink": booklink,
                                    "BookRate": bookrate,
                                    "BookPages": bookpages,
                                    "BookDate": pubyear,
                                    "BookLang": bookLanguage,
                                    "Status": book_status,
                                    "BookAdded": today(),
                                    "Series": series,
                                    "SeriesNum": seriesNum
                                }

                                resultsCount = resultsCount + 1

                                myDB.upsert("books", newValueDict, controlValueDict)
                                logger.debug(u"Book found: " + book.find('title').text + " " + pubyear)

                            if 'nocover' in bookimg or 'nophoto' in bookimg:
                                # try to get a cover from librarything
                                workcover = getBookCover(bookid)
                                if workcover:
                                    logger.debug(u'Updated cover for %s to %s' % (bookname, workcover))
                                    controlValueDict = {"BookID": bookid}
                                    newValueDict = {"BookImg": workcover}
                                    myDB.upsert("books", newValueDict, controlValueDict)

                            elif bookimg and bookimg.startswith('http'):
                                link = cache_cover(bookid, bookimg)
                                if link is not None:
                                    controlValueDict = {"BookID": bookid}
                                    newValueDict = {"BookImg": link}
                                    myDB.upsert("books", newValueDict, controlValueDict)

                            if seriesNum == None:
                                # try to get series info from librarything
                                series, seriesNum = getWorkSeries(bookid)
                                if seriesNum:
                                    logger.debug(u'Updated series: %s [%s]' % (series, seriesNum))
                                    controlValueDict = {"BookID": bookid}
                                    newValueDict = {
                                        "Series": series,
                                        "SeriesNum": seriesNum
                                    }
                                    myDB.upsert("books", newValueDict, controlValueDict)

                            worklink = getWorkPage(bookid)
                            if worklink:
                                controlValueDict = {"BookID": bookid}
                                newValueDict = {"WorkPage": worklink}
                                myDB.upsert("books", newValueDict, controlValueDict)

                            if not find_book_status:
                                logger.debug(u"[%s] Added book: %s" % (authorname, bookname))
                                added_count = added_count + 1
                            else:
                                logger.debug(u"[%s] Updated book: %s" % (authorname, bookname))
                                updated_count = updated_count + 1
                        else:
                            book_ignore_count = book_ignore_count + 1

                loopCount = loopCount + 1
                URL = 'http://www.goodreads.com/author/list/' + authorid + '.xml?' + \
                      urllib.urlencode(self.params) + '&page=' + str(loopCount)
                resultxml = None
                try:
                    rootxml, in_cache = get_xml_request(URL, useCache=not refresh)
                    if rootxml is None:
                        logger.debug('Error requesting next page of results')
                    else:
                        resultxml = rootxml.getiterator('book')
                        if not in_cache:
                            api_hits = api_hits + 1
                except Exception as e:
                    resultxml = None
                    logger.error("Error finding next page of results: %s" % e)

                if resultxml is not None:
                    if all(False for book in resultxml):  # returns True if iterator is empty
                        resultxml = None

        lastbook = myDB.action('SELECT BookName, BookLink, BookDate from books WHERE AuthorID="%s" \
                                AND Status != "Ignored" order by BookDate DESC' % authorid).fetchone()
        if lastbook:
            lastbookname = lastbook['BookName']
            lastbooklink = lastbook['BookLink']
            lastbookdate = lastbook['BookDate']
        else:
            lastbookname = None
            lastbooklink = None
            lastbookdate = None

        controlValueDict = {"AuthorID": authorid}
        newValueDict = {
            "Status": "Active",
            "LastBook": lastbookname,
            "LastLink": lastbooklink,
            "LastDate": lastbookdate
        }
        myDB.upsert("authors", newValueDict, controlValueDict)

        # This is here because GoodReads sometimes has several entries with the same BookID!
        modified_count = added_count + updated_count

        logger.debug("Found %s total book%s for author" % (total_count, plural(total_count)))
        logger.debug("Removed %s bad language result%s for author" % (ignored, plural(ignored)))
        logger.debug("Removed %s bad character or no-name result%s for author" % (removedResults, plural(removedResults)))
        logger.debug("Removed %s duplicate result%s for author" % (duplicates, plural(duplicates)))
        logger.debug("Ignored %s book%s by author marked as Ignored" % (book_ignore_count, plural(book_ignore_count)))
        logger.debug("Imported/Updated %s book%s for author" % (modified_count, plural(modified_count)))

        myDB.action('insert into stats values ("%s", %i, %i, %i, %i, %i, %i, %i, %i, %i)' %
                    (authorname, api_hits, gr_lang_hits, lt_lang_hits, gb_lang_change,
                     cache_hits, ignored, removedResults, not_cached, duplicates))

        if refresh:
            logger.info("[%s] Book processing complete: Added %s book%s / Updated %s book%s" %
                        (authorname, added_count, plural(added_count), updated_count, plural(updated_count)))
        else:
            logger.info("[%s] Book processing complete: Added %s book%s to the database" %
                        (authorname, added_count, plural(added_count)))

        return books_dict
Exemplo n.º 10
0
def LibraryScan(startdir=None):
    """ Scan a directory tree adding new books into database
        Return how many books you added """
    try:
        destdir = lazylibrarian.DIRECTORY('Destination')
        if not startdir:
            if not destdir:
                logger.warn('Cannot find destination directory: %s. Not scanning' % destdir)
                return 0
            startdir = destdir

        if not os.path.isdir(startdir):
            logger.warn('Cannot find directory: %s. Not scanning' % startdir)
            return 0

        if not internet():
            logger.warn('Libraryscan: No internet connection')
            return 0

        myDB = database.DBConnection()

        # keep statistics of full library scans
        if startdir == destdir:
            myDB.action('DELETE from stats')
            try:  # remove any extra whitespace in authornames
                authors = myDB.select('SELECT AuthorID,AuthorName FROM authors WHERE AuthorName like "%  %"')
                if authors:
                    logger.info('Removing extra spaces from %s authorname%s' % (len(authors), plural(len(authors))))
                    for author in authors:
                        authorid = author["AuthorID"]
                        authorname = ' '.join(author['AuthorName'].split())
                        # Have we got author name both with-and-without extra spaces? If so, merge them
                        duplicate = myDB.match(
                            'Select AuthorID,AuthorName FROM authors WHERE AuthorName="%s"' % authorname)
                        if duplicate:
                            myDB.action('DELETE from authors where authorname="%s"' % author['AuthorName'])
                            if author['AuthorID'] != duplicate['AuthorID']:
                                myDB.action('UPDATE books set AuthorID="%s" WHERE AuthorID="%s"' %
                                            (duplicate['AuthorID'], author['AuthorID']))
                        else:
                            myDB.action(
                                'UPDATE authors set AuthorName="%s" WHERE AuthorID="%s"' % (authorname, authorid))
            except Exception as e:
                logger.info('Error: ' + str(e))

        logger.info('Scanning ebook directory: %s' % startdir)

        new_book_count = 0
        modified_count = 0
        rescan_count = 0
        rescan_hits = 0
        file_count = 0
        author = ""

        if lazylibrarian.CONFIG['FULL_SCAN']:
            cmd = 'select AuthorName, BookName, BookFile, BookID from books,authors'
            cmd += ' where books.AuthorID = authors.AuthorID and books.Status="Open"'
            if not startdir == destdir:
                cmd += ' and BookFile like "' + startdir + '%"'
            books = myDB.select(cmd)
            status = lazylibrarian.CONFIG['NOTFOUND_STATUS']
            logger.info('Missing books will be marked as %s' % status)
            for book in books:
                bookID = book['BookID']
                bookfile = book['BookFile']

                if not (bookfile and os.path.isfile(bookfile)):
                    myDB.action('update books set Status="%s" where BookID="%s"' % (status, bookID))
                    myDB.action('update books set BookFile="" where BookID="%s"' % bookID)
                    logger.warn('Book %s - %s updated as not found on disk' % (book['AuthorName'], book['BookName']))

        # to save repeat-scans of the same directory if it contains multiple formats of the same book,
        # keep track of which directories we've already looked at
        processed_subdirectories = []
        warned = False  # have we warned about no new authors setting
        matchString = ''
        for char in lazylibrarian.CONFIG['EBOOK_DEST_FILE']:
            matchString = matchString + '\\' + char
        # massage the EBOOK_DEST_FILE config parameter into something we can use
        # with regular expression matching
        booktypes = ''
        count = -1
        booktype_list = getList(lazylibrarian.CONFIG['EBOOK_TYPE'])
        for book_type in booktype_list:
            count += 1
            if count == 0:
                booktypes = book_type
            else:
                booktypes = booktypes + '|' + book_type
        matchString = matchString.replace("\\$\\A\\u\\t\\h\\o\\r", "(?P<author>.*?)").replace(
            "\\$\\T\\i\\t\\l\\e", "(?P<book>.*?)") + '\.[' + booktypes + ']'
        pattern = re.compile(matchString, re.VERBOSE)

        for r, d, f in os.walk(startdir):
            for directory in d[:]:
                # prevent magazine being scanned
                if directory.startswith("_") or directory.startswith("."):
                    d.remove(directory)

            for files in f:
                file_count += 1

                if isinstance(r, str):
                    r = r.decode(lazylibrarian.SYS_ENCODING)

                subdirectory = r.replace(startdir, '')
                # Added new code to skip if we've done this directory before.
                # Made this conditional with a switch in config.ini
                # in case user keeps multiple different books in the same subdirectory
                if lazylibrarian.CONFIG['IMP_SINGLEBOOK'] and (subdirectory in processed_subdirectories):
                    logger.debug("[%s] already scanned" % subdirectory)
                else:
                    # If this is a book, try to get author/title/isbn/language
                    # if epub or mobi, read metadata from the book
                    # If metadata.opf exists, use that allowing it to override
                    # embedded metadata. User may have edited metadata.opf
                    # to merge author aliases together
                    # If all else fails, try pattern match for author/title
                    # and look up isbn/lang from LT or GR later
                    match = 0
                    if is_valid_booktype(files):

                        logger.debug("[%s] Now scanning subdirectory %s" % (startdir, subdirectory))

                        language = "Unknown"
                        isbn = ""
                        book = ""
                        author = ""
                        gr_id = ""
                        gb_id = ""
                        extn = os.path.splitext(files)[1]

                        # if it's an epub or a mobi we can try to read metadata from it
                        if (extn == ".epub") or (extn == ".mobi"):
                            book_filename = os.path.join(r, files).encode(lazylibrarian.SYS_ENCODING)

                            try:
                                res = get_book_info(book_filename)
                            except Exception as e:
                                logger.debug('get_book_info failed for %s, %s' % (book_filename, str(e)))
                                res = {}
                            # title and creator are the minimum we need
                            if 'title' in res and 'creator' in res:
                                book = res['title']
                                author = res['creator']
                                if book and len(book) > 2 and author and len(author) > 2:
                                    match = 1
                                if 'language' in res:
                                    language = res['language']
                                if 'identifier' in res:
                                    isbn = res['identifier']
                                if 'type' in res:
                                    extn = res['type']
                                logger.debug("book meta [%s] [%s] [%s] [%s] [%s]" %
                                             (isbn, language, author, book, extn))
                            if not match:
                                logger.debug("Book meta incomplete in %s" % book_filename)

                        # calibre uses "metadata.opf", LL uses "bookname - authorname.opf"
                        # just look for any .opf file in the current directory since we don't know
                        # LL preferred authorname/bookname at this point.
                        # Allow metadata in file to override book contents as may be users pref

                        metafile = opf_file(r)
                        try:
                            res = get_book_info(metafile)
                        except Exception as e:
                            logger.debug('get_book_info failed for %s, %s' % (metafile, str(e)))
                            res = {}
                        # title and creator are the minimum we need
                        if 'title' in res and 'creator' in res:
                            book = res['title']
                            author = res['creator']
                            if book and len(book) > 2 and author and len(author) > 2:
                                match = 1
                            if 'language' in res:
                                language = res['language']
                            if 'identifier' in res:
                                isbn = res['identifier']
                            if 'gr_id' in res:
                                gr_id = res['gr_id']
                            logger.debug("file meta [%s] [%s] [%s] [%s] [%s]" % (isbn, language, author, book, gr_id))
                        if not match:
                            logger.debug("File meta incomplete in %s" % metafile)

                        if not match:  # no author/book from metadata file, and not embedded either
                            match = pattern.match(files)
                            if match:
                                author = match.group("author")
                                book = match.group("book")
                                if len(book) <= 2 or len(author) <= 2:
                                    match = 0
                            if not match:
                                logger.debug("Pattern match failed [%s]" % files)

                        if match:
                            # flag that we found a book in this subdirectory
                            processed_subdirectories.append(subdirectory)

                            # If we have a valid looking isbn, and language != "Unknown", add it to cache
                            if language != "Unknown" and is_valid_isbn(isbn):
                                logger.debug("Found Language [%s] ISBN [%s]" % (language, isbn))
                                # we need to add it to language cache if not already
                                # there, is_valid_isbn has checked length is 10 or 13
                                if len(isbn) == 10:
                                    isbnhead = isbn[0:3]
                                else:
                                    isbnhead = isbn[3:6]
                                match = myDB.match('SELECT lang FROM languages where isbn = "%s"' % isbnhead)
                                if not match:
                                    myDB.action('insert into languages values ("%s", "%s")' % (isbnhead, language))
                                    logger.debug("Cached Lang [%s] ISBN [%s]" % (language, isbnhead))
                                else:
                                    logger.debug("Already cached Lang [%s] ISBN [%s]" % (language, isbnhead))

                            author, authorid, new = addAuthorNameToDB(author)  # get the author name as we know it...

                            if author:
                                # author exists, check if this book by this author is in our database
                                # metadata might have quotes in book name
                                # some books might be stored under a different author name
                                # eg books by multiple authors, books where author is "writing as"
                                # or books we moved to "merge" authors
                                book = book.replace("'", "")

                                # First try and find it under author and bookname
                                # as we may have it under a different bookid or isbn to goodreads/googlebooks
                                # which might have several bookid/isbn for the same book
                                bookid = find_book_in_db(myDB, author, book)

                                if not bookid:
                                    # Title or author name might not match or multiple authors
                                    # See if the gr_id, gb_id is already in our database
                                    if gr_id:
                                        bookid = gr_id
                                    elif gb_id:
                                        bookid = gb_id
                                    else:
                                        bookid = ""

                                    if bookid:
                                        match = myDB.match('SELECT BookID FROM books where BookID = "%s"' % bookid)
                                        if not match:
                                            msg = 'Unable to find book %s by %s in database, trying to add it using '
                                            if bookid == gr_id:
                                                msg += "GoodReads ID " + gr_id
                                            if bookid == gb_id:
                                                msg += "GoogleBooks ID " + gb_id
                                            logger.debug(msg % (book, author))
                                            if lazylibrarian.CONFIG['BOOK_API'] == "GoodReads" and gr_id:
                                                GR_ID = GoodReads(gr_id)
                                                GR_ID.find_book(gr_id, None)
                                            elif lazylibrarian.CONFIG['BOOK_API'] == "GoogleBooks" and gb_id:
                                                GB_ID = GoogleBooks(gb_id)
                                                GB_ID.find_book(gb_id, None)
                                            # see if it's there now...
                                            match = myDB.match('SELECT BookID from books where BookID="%s"' % bookid)
                                            if not match:
                                                logger.debug("Unable to add bookid %s to database" % bookid)
                                                bookid = ""

                                if not bookid and isbn:
                                    # See if the isbn is in our database
                                    match = myDB.match('SELECT BookID FROM books where BookIsbn = "%s"' % isbn)
                                    if match:
                                        bookid = match['BookID']

                                if not bookid:
                                    # get author name from parent directory of this book directory
                                    newauthor = os.path.basename(os.path.dirname(r))
                                    # calibre replaces trailing periods with _ eg Smith Jr. -> Smith Jr_
                                    if newauthor.endswith('_'):
                                        newauthor = newauthor[:-1] + '.'
                                    if author.lower() != newauthor.lower():
                                        logger.debug("Trying authorname [%s]" % newauthor)
                                        bookid = find_book_in_db(myDB, newauthor, book)
                                        if bookid:
                                            logger.warn("%s not found under [%s], found under [%s]" %
                                                        (book, author, newauthor))

                                # at this point if we still have no bookid, it looks like we
                                # have author and book title but no database entry for it
                                if not bookid:
                                    if lazylibrarian.CONFIG['BOOK_API'] == "GoodReads":
                                        # Either goodreads doesn't have the book or it didn't match language prefs
                                        # Since we have the book anyway, try and reload it ignoring language prefs
                                        rescan_count += 1
                                        base_url = 'http://www.goodreads.com/search.xml?q='
                                        params = {"key": lazylibrarian.CONFIG['GR_API']}
                                        if author[1] in '. ':
                                            surname = author
                                            forename = ''
                                            while surname[1] in '. ':
                                                forename = forename + surname[0] + '.'
                                                surname = surname[2:].strip()
                                            if author != forename + ' ' + surname:
                                                logger.debug('Stripped authorname [%s] to [%s %s]' %
                                                            (author, forename, surname))
                                                author = forename + ' ' + surname

                                        author = ' '.join(author.split())  # ensure no extra whitespace

                                        searchname = author + ' ' + book
                                        searchname = cleanName(unaccented(searchname))
                                        searchterm = urllib.quote_plus(searchname.encode(lazylibrarian.SYS_ENCODING))
                                        set_url = base_url + searchterm + '&' + urllib.urlencode(params)
                                        try:
                                            rootxml, in_cache = get_xml_request(set_url)
                                            if not len(rootxml):
                                                logger.debug("Error requesting results from GoodReads")
                                            else:
                                                resultxml = rootxml.getiterator('work')
                                                for item in resultxml:
                                                    booktitle = item.find('./best_book/title').text
                                                    book_fuzz = fuzz.token_set_ratio(booktitle, book)
                                                    if book_fuzz >= 98:
                                                        logger.debug("Rescan found %s : %s" % (booktitle, language))
                                                        rescan_hits += 1
                                                        bookid = item.find('./best_book/id').text
                                                        GR_ID = GoodReads(bookid)
                                                        GR_ID.find_book(bookid, None)
                                                        if language and language != "Unknown":
                                                            # set language from book metadata
                                                            logger.debug("Setting language from metadata %s : %s" % (booktitle, language))
                                                            myDB.action('UPDATE books SET BookLang="%s" WHERE BookID="%s"' %
                                                                        (language, bookid))
                                                        break
                                                if not bookid:
                                                    logger.warn("GoodReads doesn't know about %s" % book)
                                        except Exception as e:
                                            logger.error("Error finding rescan results: %s" % str(e))

                                    elif lazylibrarian.CONFIG['BOOK_API'] == "GoogleBooks":
                                        # if we get here using googlebooks it's because googlebooks
                                        # doesn't have the book. No point in looking for it again.
                                        logger.warn("GoogleBooks doesn't know about %s" % book)

                                # see if it's there now...
                                if bookid:
                                    cmd = 'SELECT books.Status, BookFile, AuthorName, BookName from books,authors '
                                    cmd += 'where books.AuthorID = authors.AuthorID and BookID="%s"' % bookid
                                    check_status = myDB.match(cmd)

                                    if not check_status:
                                        logger.debug('Unable to find bookid %s in database' % bookid)
                                    else:
                                        if check_status['Status'] != 'Open':
                                            # we found a new book
                                            new_book_count += 1
                                            myDB.action(
                                                'UPDATE books set Status="Open" where BookID="%s"' % bookid)

                                        # store book location so we can check if it gets removed
                                        book_filename = os.path.join(r, files)
                                        if not check_status['BookFile']:  # no previous location
                                            myDB.action('UPDATE books set BookFile="%s" where BookID="%s"' %
                                                        (book_filename, bookid))
                                        # location may have changed since last scan
                                        elif book_filename != check_status['BookFile']:
                                            modified_count += 1
                                            logger.warn("Updating book location for %s %s from %s to %s" %
                                                        (author, book, check_status['BookFile'], book_filename))
                                            logger.debug("%s %s matched %s BookID %s, [%s][%s]" %
                                                        (author, book, check_status['Status'], bookid,
                                                        check_status['AuthorName'], check_status['BookName']))
                                            myDB.action('UPDATE books set BookFile="%s" where BookID="%s"' %
                                                        (book_filename, bookid))

                                        # update cover file to cover.jpg in book folder (if exists)
                                        bookdir = os.path.dirname(book_filename)
                                        coverimg = os.path.join(bookdir, 'cover.jpg')
                                        if os.path.isfile(coverimg):
                                            cachedir = lazylibrarian.CACHEDIR
                                            cacheimg = os.path.join(cachedir, 'book', bookid + '.jpg')
                                            copyfile(coverimg, cacheimg)
                                else:
                                    logger.warn(
                                        "Failed to match book [%s] by [%s] in database" % (book, author))
                            else:
                                if not warned and not lazylibrarian.CONFIG['ADD_AUTHOR']:
                                    logger.warn("Add authors to database is disabled")
                                    warned = True

        logger.info("%s/%s new/modified book%s found and added to the database" %
                    (new_book_count, modified_count, plural(new_book_count + modified_count)))
        logger.info("%s file%s processed" % (file_count, plural(file_count)))

        if startdir == destdir:
            # On full library scans, check for missing workpages
            setWorkPages()
            # and books with unknown language
            nolang = myDB.match(
                "select count('BookID') as counter from Books where status='Open' and BookLang='Unknown'")
            nolang = nolang['counter']
            if nolang:
                logger.warn("Found %s book%s in your library with unknown language" % (nolang, plural(nolang)))
                # show stats if new books were added
            stats = myDB.match(
                "SELECT sum(GR_book_hits), sum(GR_lang_hits), sum(LT_lang_hits), sum(GB_lang_change), \
                    sum(cache_hits), sum(bad_lang), sum(bad_char), sum(uncached), sum(duplicates) FROM stats")

            st= {'GR_book_hits': stats['sum(GR_book_hits)'], 'GB_book_hits': stats['sum(GR_book_hits)'],
                 'GR_lang_hits': stats['sum(GR_lang_hits)'], 'LT_lang_hits': stats['sum(LT_lang_hits)'],
                 'GB_lang_change': stats['sum(GB_lang_change)'], 'cache_hits': stats['sum(cache_hits)'],
                 'bad_lang': stats['sum(bad_lang)'], 'bad_char': stats['sum(bad_char)'],
                 'uncached': stats['sum(uncached)'], 'duplicates': stats['sum(duplicates)']}

            for item in st.keys():
                if st[item] is None:
                    st[item] = 0

            if lazylibrarian.CONFIG['BOOK_API'] == "GoogleBooks":
                logger.debug("GoogleBooks was hit %s time%s for books" %
                             (st['GR_book_hits'], plural(st['GR_book_hits'])))
                logger.debug("GoogleBooks language was changed %s time%s" %
                             (st['GB_lang_change'], plural(st['GB_lang_change'])))
            if lazylibrarian.CONFIG['BOOK_API'] == "GoodReads":
                logger.debug("GoodReads was hit %s time%s for books" %
                             (st['GR_book_hits'], plural(st['GR_book_hits'])))
                logger.debug("GoodReads was hit %s time%s for languages" %
                             (st['GR_lang_hits'], plural(st['GR_lang_hits'])))
            logger.debug("LibraryThing was hit %s time%s for languages" %
                         (st['LT_lang_hits'], plural(st['LT_lang_hits'])))
            logger.debug("Language cache was hit %s time%s" %
                         (st['cache_hits'], plural(st['cache_hits'])))
            logger.debug("Unwanted language removed %s book%s" %
                         (st['bad_lang'], plural(st['bad_lang'])))
            logger.debug("Unwanted characters removed %s book%s" %
                         (st['bad_char'], plural(st['bad_char'])))
            logger.debug("Unable to cache language for %s book%s with missing ISBN" %
                         (st['uncached'], plural(st['uncached'])))
            logger.debug("Found %s duplicate book%s" %
                         (st['duplicates'], plural(st['duplicates'])))
            logger.debug("Rescan %s hit%s, %s miss" %
                         (rescan_hits, plural(rescan_hits), rescan_count - rescan_hits))
            logger.debug("Cache %s hit%s, %s miss" %
                         (lazylibrarian.CACHE_HIT, plural(lazylibrarian.CACHE_HIT), lazylibrarian.CACHE_MISS))
            cachesize = myDB.match("select count('ISBN') as counter from languages")
            logger.debug("ISBN Language cache holds %s entries" % cachesize['counter'])

            # Cache any covers and images
            images = myDB.select('select bookid, bookimg, bookname from books where bookimg like "http%"')
            if len(images):
                logger.info("Caching cover%s for %i book%s" % (plural(len(images)), len(images), plural(len(images))))
                for item in images:
                    bookid = item['bookid']
                    bookimg = item['bookimg']
                    # bookname = item['bookname']
                    newimg, success = cache_img("book", bookid, bookimg)
                    if success:
                        myDB.action('update books set BookImg="%s" where BookID="%s"' % (newimg, bookid))

            images = myDB.select('select AuthorID, AuthorImg, AuthorName from authors where AuthorImg like "http%"')
            if len(images):
                logger.info("Caching image%s for %i author%s" % (plural(len(images)), len(images), plural(len(images))))
                for item in images:
                    authorid = item['authorid']
                    authorimg = item['authorimg']
                    # authorname = item['authorname']
                    newimg, success = cache_img("author", authorid, authorimg)
                    if success:
                        myDB.action('update authors set AuthorImg="%s" where AuthorID="%s"' % (newimg, authorid))

            # On full scan, update bookcounts for all authors, not just new ones - refresh may have located
            # new books for existing authors especially if switched provider gb/gr or changed wanted languages
            authors = myDB.select('select AuthorID from authors')
        else:
            # On single author/book import, just update bookcount for that author
            authors = myDB.select('select AuthorID from authors where AuthorName = "%s"' % author.replace('"', '""'))

        logger.debug('Updating bookcounts for %i author%s' % (len(authors), plural(len(authors))))
        for author in authors:
            update_totals(author['AuthorID'])

        logger.info('Library scan complete')
        return new_book_count

    except Exception:
        logger.error('Unhandled exception in libraryScan: %s' % traceback.format_exc())
Exemplo n.º 11
0
    def find_results(self, authorname=None, queue=None):
        """Search GoodReads for works matching ``authorname`` and queue the matches.

        The keyword is sent to the GoodReads ``search.xml`` endpoint.  Every
        ``work`` element in the reply becomes a result dictionary carrying the
        book details plus fuzzy-match scores of the keyword against the author
        name, the title and an ISBN-shaped keyword.

        :param authorname: search keyword (author name, title or ISBN)
        :param queue: object with a ``put`` method; it receives the list of result
                      dictionaries, which is empty when the request fails
        :return: None, the results are delivered through ``queue``
        """
        resultlist = []
        resultcount = 0  # bound up front so the summary log below can always use it
        api_hits = 0
        # Goodreads doesn't like initials followed by spaces,
        # eg "M L Hamilton", needs "M. L. Hamilton" or "M.L.Hamilton"
        # but DOES need spaces if not initials eg "Tom.Holt" fails, but "Tom Holt" works
        # (slice instead of index so a one-character keyword cannot raise IndexError)
        if authorname[1:2] == ' ':
            authorname = authorname.replace(' ', '.')
            authorname = authorname.replace('..', '.')

        url = urllib.quote_plus(authorname.encode(lazylibrarian.SYS_ENCODING))
        set_url = 'http://www.goodreads.com/search.xml?q=' + url + '&' + urllib.urlencode(
            self.params)
        logger.debug('Now searching GoodReads API with keyword: ' + authorname)
        logger.debug('Searching for %s at: %s' % (authorname, set_url))

        # The keyword is treated as an ISBN when everything except the trailing
        # check character is 9 or 12 digits.  The text itself is inspected because
        # converting to int drops leading zeros (e.g. 0441172717 would be rejected).
        # The score only depends on the keyword, so it is computed once.
        isbn_stem = authorname[:-1]
        if isbn_stem.isdigit() and len(isbn_stem) in (9, 12):
            isbn_fuzz = 100
        else:
            isbn_fuzz = 0

        try:
            try:
                rootxml, in_cache = get_xml_request(set_url)
            except Exception as e:
                logger.error("Error finding results: %s" % e)
                # still deliver the (empty) list so a reader of the queue gets a result
                queue.put(resultlist)
                return
            # rootxml may be None as well as an element without children
            if rootxml is None or not len(rootxml):
                logger.debug("Error requesting results")
                queue.put(resultlist)
                return
            if not in_cache:
                api_hits += 1

            for author in rootxml.getiterator('work'):
                pubyear = author.find('original_publication_year').text
                bookdate = "0000" if pubyear is None else pubyear

                authorNameResult = author.find('./best_book/author/name').text

                try:
                    bookimg = author.find('./best_book/image_url').text
                    if bookimg == 'http://www.goodreads.com/assets/nocover/111x148.png':
                        bookimg = 'images/nocover.png'
                except (KeyError, AttributeError):
                    bookimg = 'images/nocover.png'

                # A missing element gives AttributeError, an empty one TypeError and
                # non-numeric text ValueError; all of them mean "no usable rating".
                try:
                    bookrate = float(author.find('average_rating').text)
                except (AttributeError, TypeError, ValueError):
                    bookrate = 0.0

                bookid = author.find('./best_book/id').text
                booklink = 'http://www.goodreads.com/book/show/' + bookid

                bookTitle = author.find('./best_book/title').text
                if bookTitle is None:
                    bookTitle = ""

                author_fuzz = fuzz.token_set_ratio(authorNameResult, authorname)
                book_fuzz = fuzz.token_set_ratio(bookTitle, authorname)
                highest_fuzz = max(author_fuzz, book_fuzz, isbn_fuzz)

                # The search reply is only used for the fields read above; the
                # remaining keys are filled with fixed placeholder values.
                resultlist.append({
                    'authorname': authorNameResult,
                    'bookid': bookid,
                    'authorid': author.find('./best_book/author/id').text,
                    'bookname': bookTitle.encode("ascii", "ignore"),
                    'booksub': None,
                    'bookisbn': '',
                    'bookpub': "",
                    'bookdate': bookdate,
                    'booklang': "Unknown",
                    'booklink': booklink,
                    'bookrate': bookrate,
                    'bookimg': bookimg,
                    'bookpages': '0',
                    'bookgenre': '',
                    'bookdesc': '',
                    'author_fuzz': author_fuzz,
                    'book_fuzz': book_fuzz,
                    'isbn_fuzz': isbn_fuzz,
                    'highest_fuzz': highest_fuzz,
                    # NOTE(review): num_reviews carries the rating value, as before
                    'num_reviews': bookrate
                })

                resultcount += 1

        except urllib2.HTTPError as err:
            # elif chain: exactly one message per error code
            if err.code == 404:
                logger.error('Received a 404 error when searching for author')
            elif err.code == 403:
                logger.warn('Access to api is denied: usage exceeded')
            else:
                logger.error(
                    'An unexpected error has occurred when searching for an author'
                )

        logger.debug('Found %s result%s with keyword: %s' %
                     (resultcount, plural(resultcount), authorname))
        logger.debug('The GoodReads API was hit %s time%s for keyword %s' %
                     (api_hits, plural(api_hits), authorname))

        queue.put(resultlist)
Exemplo n.º 12
0
    def find_book(self, bookid=None, queue=None):
        """Fetch a single book from GoodReads and store it in the database as "Wanted".

        The book record is requested by ``bookid``.  Its author is looked up and,
        when that author is not in the ``authors`` table yet, added with status
        "Ignored".  The book row is then upserted, followed by optional cover,
        series and work-page updates.

        :param bookid: GoodReads book id (string)
        :param queue: accepted for signature compatibility, not used by this method
        :return: None; returns early when the request fails or no author id is found
        """
        myDB = database.DBConnection()

        URL = 'https://www.goodreads.com/book/show/' + bookid + '?' + urllib.urlencode(self.params)

        try:
            rootxml, in_cache = get_xml_request(URL)
            if rootxml is None:
                logger.debug("Error requesting book")
                return
        except Exception as e:
            logger.error("Error finding book: %s" % str(e))
            return

        bookLanguage = rootxml.find('./book/language_code').text
        bookname = rootxml.find('./book/title').text

        if not bookLanguage:
            bookLanguage = "Unknown"
        #
        # PAB user has said they want this book, don't block for unwanted language, just warn
        #
        valid_langs = getList(lazylibrarian.CONFIG['IMP_PREFLANG'])
        if bookLanguage not in valid_langs:
            logger.debug('Book %s goodreads language does not match preference, %s' % (bookname, bookLanguage))

        pubyear = rootxml.find('./book/publication_year').text
        bookdate = "0000" if pubyear is None else pubyear

        try:
            bookimg = rootxml.find('./book/img_url').text
            if 'assets/nocover' in bookimg:
                bookimg = 'images/nocover.png'
        except (KeyError, AttributeError, TypeError):
            # AttributeError: element missing, TypeError: element present but empty
            bookimg = 'images/nocover.png'

        authorname = rootxml.find('./book/authors/author/name').text
        bookdesc = rootxml.find('./book/description').text
        bookisbn = rootxml.find('./book/isbn').text
        bookpub = rootxml.find('./book/publisher').text
        booklink = rootxml.find('./book/link').text
        # a missing, empty or non-numeric rating must not abort the import
        try:
            bookrate = float(rootxml.find('./book/average_rating').text)
        except (AttributeError, TypeError, ValueError):
            bookrate = 0.0
        # same './book/...' path form as every other lookup in this method
        bookpages = rootxml.find('./book/num_pages').text

        GR = GoodReads(authorname)
        author = GR.find_author_id()
        if not author:
            logger.warn("No AuthorID for %s, unable to add book %s" % (authorname, bookname))
            return

        AuthorID = author['authorid']
        match = myDB.match('SELECT AuthorID from authors WHERE AuthorID="%s"' % AuthorID)
        if not match:
            # Double any embedded quote so a name containing '"' cannot break the
            # statement (same escaping the library scan uses for author names).
            match = myDB.match('SELECT AuthorID from authors WHERE AuthorName="%s"' %
                               author['authorname'].replace('"', '""'))
            if match:
                logger.debug('%s: Changing authorid from %s to %s' %
                             (author['authorname'], AuthorID, match['AuthorID']))
                AuthorID = match['AuthorID']    # we have a different authorid for that authorname
            else:
                # no author but request to add book, add author as "ignored"
                # User hit "add book" button from a search
                controlValueDict = {"AuthorID": AuthorID}
                newValueDict = {
                    "AuthorName": author['authorname'],
                    "AuthorImg": author['authorimg'],
                    "AuthorLink": author['authorlink'],
                    "AuthorBorn": author['authorborn'],
                    "AuthorDeath": author['authordeath'],
                    "DateAdded": today(),
                    "Status": "Ignored"
                }
                myDB.upsert("authors", newValueDict, controlValueDict)

        bookname = unaccented(bookname)
        bookname, booksub = split_title(authorname, bookname)
        dic = {':': '.', '"': '', '\'': ''}
        bookname = replace_all(bookname, dic).strip()
        booksub = replace_all(booksub, dic).strip()
        # series details are read from the subtitle when there is one, else from the title
        if booksub:
            series, seriesNum = bookSeries(booksub)
        else:
            series, seriesNum = bookSeries(bookname)

        controlValueDict = {"BookID": bookid}
        newValueDict = {
            "AuthorID": AuthorID,
            "BookName": bookname,
            "BookSub": booksub,
            "BookDesc": bookdesc,
            "BookIsbn": bookisbn,
            "BookPub": bookpub,
            "BookGenre": "",
            "BookImg": bookimg,
            "BookLink": booklink,
            "BookRate": bookrate,
            "BookPages": bookpages,
            "BookDate": bookdate,
            "BookLang": bookLanguage,
            "Status": "Wanted",
            "BookAdded": today()
        }

        myDB.upsert("books", newValueDict, controlValueDict)
        logger.info("%s added to the books database" % bookname)

        if 'nocover' in bookimg or 'nophoto' in bookimg:
            # try to get a cover from librarything
            workcover = getBookCover(bookid)
            if workcover:
                logger.debug(u'Updated cover for %s to %s' % (bookname, workcover))
                controlValueDict = {"BookID": bookid}
                newValueDict = {"BookImg": workcover}
                myDB.upsert("books", newValueDict, controlValueDict)

        elif bookimg and bookimg.startswith('http'):
            # remote cover: store the cached copy's link instead of the remote url
            link, success = cache_img("book", bookid, bookimg)
            if success:
                controlValueDict = {"BookID": bookid}
                newValueDict = {"BookImg": link}
                myDB.upsert("books", newValueDict, controlValueDict)
            else:
                logger.debug('Failed to cache image for %s' % bookimg)

        if lazylibrarian.CONFIG['ADD_SERIES']:
            # prefer series info from librarything
            seriesdict = getWorkSeries(bookid)
            if seriesdict:
                logger.debug(u'Updated series: %s [%s]' % (bookid, seriesdict))
            elif series:
                # fall back to the series parsed from the title/subtitle above
                seriesdict = {cleanName(unaccented(series)): seriesNum}
            setSeries(seriesdict, bookid)

        worklink = getWorkPage(bookid)
        if worklink:
            controlValueDict = {"BookID": bookid}
            newValueDict = {"WorkPage": worklink}
            myDB.upsert("books", newValueDict, controlValueDict)
Exemplo n.º 13
0
    def find_results(self, searchterm=None, queue=None):
        """Search GoodReads for ``searchterm`` and put the list of matches on ``queue``.

        Every ``work`` element in the ``search.xml`` reply becomes a result
        dictionary with the book details and fuzzy-match scores of the search
        term against the author name and the title; a valid ISBN search term
        scores 100.

        :param searchterm: text to search for; the ' <ll> ' title/author separator
                           is removed before the request is made
        :param queue: object with a ``put`` method; it receives the list of result
                      dictionaries, which is empty when the request fails
        :return: None, the results are delivered through ``queue``
        """
        try:
            resultlist = []
            resultcount = 0
            api_hits = 0
            # we don't use the title/author separator in goodreads
            searchterm = searchterm.replace(' <ll> ', '')

            url = urllib.quote_plus(searchterm.encode(lazylibrarian.SYS_ENCODING))
            set_url = 'http://www.goodreads.com/search.xml?q=' + url + '&' + urllib.urlencode(self.params)
            logger.debug('Now searching GoodReads API with searchterm: %s' % searchterm)

            try:
                try:
                    rootxml, in_cache = get_xml_request(set_url)
                except Exception as e:
                    logger.error("Error finding gr results: %s" % str(e))
                    # still deliver the (empty) list so a reader of the queue gets a result
                    queue.put(resultlist)
                    return
                # rootxml may be None as well as an element without children
                if rootxml is None or not len(rootxml):
                    logger.debug("Error requesting results")
                    queue.put(resultlist)
                    return
                if not in_cache:
                    api_hits += 1

                for author in rootxml.getiterator('work'):
                    pubyear = author.find('original_publication_year').text
                    bookdate = "0000" if pubyear is None else pubyear

                    authorNameResult = author.find('./best_book/author/name').text
                    # Goodreads sometimes puts extra whitespace in the author names!
                    authorNameResult = ' '.join(authorNameResult.split())

                    try:
                        bookimg = author.find('./best_book/image_url').text
                        if bookimg == 'http://www.goodreads.com/assets/nocover/111x148.png':
                            bookimg = 'images/nocover.png'
                    except (KeyError, AttributeError):
                        bookimg = 'images/nocover.png'

                    # A missing element gives AttributeError, an empty one TypeError and
                    # non-numeric text ValueError; all of them mean "no usable rating".
                    try:
                        bookrate = float(author.find('average_rating').text)
                    except (AttributeError, TypeError, ValueError):
                        bookrate = 0.0

                    bookid = author.find('./best_book/id').text
                    booklink = 'http://www.goodreads.com/book/show/' + bookid

                    bookTitle = author.find('./best_book/title').text
                    if bookTitle is None:
                        bookTitle = ""

                    author_fuzz = fuzz.ratio(authorNameResult, searchterm)
                    book_fuzz = fuzz.ratio(bookTitle, searchterm)
                    isbn_fuzz = 100 if is_valid_isbn(searchterm) else 0

                    # the search term holds author and title together, so the two
                    # scores are averaged before comparing with the isbn score
                    highest_fuzz = max((author_fuzz + book_fuzz) / 2, isbn_fuzz)

                    # The search reply is only used for the fields read above; the
                    # remaining keys are filled with fixed placeholder values.
                    resultlist.append({
                        # return the whitespace-normalised name, not the raw text
                        'authorname': authorNameResult,
                        'bookid': bookid,
                        'authorid': author.find('./best_book/author/id').text,
                        'bookname': bookTitle.encode("ascii", "ignore"),
                        'booksub': "",
                        'bookisbn': '',
                        'bookpub': "",
                        'bookdate': bookdate,
                        'booklang': "Unknown",
                        'booklink': booklink,
                        'bookrate': bookrate,
                        'bookimg': bookimg,
                        'bookpages': '0',
                        'bookgenre': '',
                        'bookdesc': '',
                        'author_fuzz': author_fuzz,
                        'book_fuzz': book_fuzz,
                        'isbn_fuzz': isbn_fuzz,
                        'highest_fuzz': highest_fuzz,
                        # NOTE(review): num_reviews carries the rating value, as before
                        'num_reviews': bookrate
                    })

                    resultcount += 1

            except urllib2.HTTPError as err:
                # elif chain: exactly one message per error code
                if err.code == 404:
                    logger.error('Received a 404 error when searching for author')
                elif err.code == 403:
                    logger.warn('Access to api is denied: usage exceeded')
                else:
                    logger.error('An unexpected error has occurred when searching for an author: %s' % str(err))

            logger.debug('Found %s result%s with keyword: %s' % (resultcount, plural(resultcount), searchterm))
            logger.debug(
                'The GoodReads API was hit %s time%s for keyword %s' % (api_hits, plural(api_hits), searchterm))

            queue.put(resultlist)

        except Exception:
            logger.error('Unhandled exception in GR.find_results: %s' % traceback.format_exc())
Exemplo n.º 14
0
    def get_author_books(self, authorid=None, authorname=None, bookstatus="Skipped", refresh=False):
        try:
            api_hits = 0
            gr_lang_hits = 0
            lt_lang_hits = 0
            gb_lang_change = 0
            cache_hits = 0
            not_cached = 0
            URL = 'http://www.goodreads.com/author/list/' + authorid + '.xml?' + urllib.urlencode(self.params)

            # Artist is loading
            myDB = database.DBConnection()
            controlValueDict = {"AuthorID": authorid}
            newValueDict = {"Status": "Loading"}
            myDB.upsert("authors", newValueDict, controlValueDict)

            try:
                rootxml, in_cache = get_xml_request(URL, useCache=not refresh)
            except Exception as e:
                logger.error("Error fetching author books: %s" % str(e))
                return
            if rootxml is None:
                logger.debug("Error requesting author books")
                return
            if not in_cache:
                api_hits += 1
            resultxml = rootxml.getiterator('book')

            valid_langs = getList(lazylibrarian.CONFIG['IMP_PREFLANG'])

            resultsCount = 0
            removedResults = 0
            duplicates = 0
            ignored = 0
            added_count = 0
            updated_count = 0
            book_ignore_count = 0
            total_count = 0

            if not len(resultxml):
                logger.warn('[%s] No books found for author with ID: %s' % (authorname, authorid))
            else:
                logger.debug("[%s] Now processing books with GoodReads API" % authorname)
                logger.debug(u"url " + URL)

                authorNameResult = rootxml.find('./author/name').text
                # Goodreads sometimes puts extra whitepase in the author names!
                authorNameResult =  ' '.join(authorNameResult.split())
                logger.debug(u"GoodReads author name [%s]" % authorNameResult)
                loopCount = 1

                while resultxml:
                    for book in resultxml:
                        total_count += 1

                        if book.find('publication_year').text is None:
                            pubyear = "0000"
                        else:
                            pubyear = book.find('publication_year').text

                        try:
                            bookimg = book.find('image_url').text
                            if 'nocover' in bookimg:
                                bookimg = 'images/nocover.png'
                        except (KeyError, AttributeError):
                            bookimg = 'images/nocover.png'

                        bookLanguage = "Unknown"
                        find_field = "id"
                        isbn = ""
                        isbnhead = ""
                        if "All" not in valid_langs:  # do we care about language
                            if book.find('isbn').text:
                                find_field = "isbn"
                                isbn = book.find('isbn').text
                                isbnhead = isbn[0:3]
                            else:
                                if book.find('isbn13').text:
                                    find_field = "isbn13"
                                    isbn = book.find('isbn13').text
                                    isbnhead = isbn[3:6]
                            # Try to use shortcut of ISBN identifier codes described here...
                            # https://en.wikipedia.org/wiki/List_of_ISBN_identifier_groups
                            if isbnhead:
                                if find_field == "isbn13" and isbn.startswith('979'):
                                    for item in lazylibrarian.isbn_979_dict:
                                        if isbnhead.startswith(item):
                                            bookLanguage = lazylibrarian.isbn_979_dict[item]
                                            break
                                    if bookLanguage != "Unknown":
                                        logger.debug("ISBN979 returned %s for %s" % (bookLanguage, isbnhead))
                                elif (find_field == "isbn") or (find_field == "isbn13" and isbn.startswith('978')):
                                    for item in lazylibrarian.isbn_978_dict:
                                        if isbnhead.startswith(item):
                                            bookLanguage = lazylibrarian.isbn_978_dict[item]
                                            break
                                    if bookLanguage != "Unknown":
                                        logger.debug("ISBN978 returned %s for %s" % (bookLanguage, isbnhead))

                            if bookLanguage == "Unknown" and isbnhead:
                                # Nothing in the isbn dictionary, try any cached results
                                match = myDB.match('SELECT lang FROM languages where isbn = "%s"' % isbnhead)
                                if match:
                                    bookLanguage = match['lang']
                                    cache_hits += 1
                                    logger.debug("Found cached language [%s] for %s [%s]" %
                                                 (bookLanguage, find_field, isbnhead))
                                else:
                                    # no match in cache, try searching librarything for a language code using the isbn
                                    # if no language found, librarything return value is "invalid" or "unknown"
                                    # returns plain text, not xml
                                    BOOK_URL = 'http://www.librarything.com/api/thingLang.php?isbn=' + isbn
                                    try:
                                        librarything_wait()
                                        resp = urllib2.urlopen(BOOK_URL, timeout=30).read()
                                        lt_lang_hits += 1
                                        logger.debug("LibraryThing reports language [%s] for %s" % (resp, isbnhead))

                                        if 'invalid' in resp or 'Unknown' in resp:
                                            bookLanguage = "Unknown"
                                        else:
                                            bookLanguage = resp  # found a language code
                                            myDB.action('insert into languages values ("%s", "%s")' %
                                                        (isbnhead, bookLanguage))
                                            logger.debug(u"LT language %s: %s" % (isbnhead, bookLanguage))
                                    except Exception as e:
                                        logger.error("Error finding LT language result for [%s], %s" % (isbn, str(e)))

                            if bookLanguage == "Unknown":
                                # still  no earlier match, we'll have to search the goodreads api
                                try:
                                    if book.find(find_field).text:
                                        BOOK_URL = 'http://www.goodreads.com/book/show?id=' + \
                                                   book.find(find_field).text + '&' + urllib.urlencode(self.params)
                                        logger.debug(u"Book URL: " + BOOK_URL)

                                        time_now = int(time.time())
                                        if time_now <= lazylibrarian.LAST_GOODREADS:
                                            time.sleep(1)

                                        bookLanguage = ""
                                        try:
                                            BOOK_rootxml, in_cache = get_xml_request(BOOK_URL)
                                            if BOOK_rootxml is None:
                                                logger.debug('Error requesting book language code')
                                            else:
                                                if not in_cache:
                                                    # only update last_goodreads if the result wasn't found in the cache
                                                    lazylibrarian.LAST_GOODREADS = time_now
                                                try:
                                                    bookLanguage = BOOK_rootxml.find('./book/language_code').text
                                                except Exception as e:
                                                    logger.debug("Error finding language_code in book xml: %s" % str(e))
                                        except Exception as e:
                                            logger.debug("Error getting book xml: %s" % str(e))

                                        if not in_cache:
                                            gr_lang_hits += 1
                                        if not bookLanguage:
                                            bookLanguage = "Unknown"
                                            # At this point, give up?
                                            # WhatWork on author/title doesn't give us a language.
                                            # It might give us the "original language" of the book (but not always)
                                            # and our copy might not be in the original language anyway
                                            # eg "The Girl With the Dragon Tattoo" original language Swedish
                                            # If we have an isbn, try WhatISBN to get alternatives
                                            # in case any of them give us a language, but it seems if thinglang doesn't
                                            # have a language for the first isbn code, it doesn't for any of the
                                            # alternatives either
                                            # Goodreads search results don't include the language. Although sometimes
                                            # it's in the html page, it's not in the xml results

                                        if isbnhead != "":
                                            # if GR didn't give an isbn we can't cache it, just use language for this book
                                            myDB.action('insert into languages values ("%s", "%s")' %
                                                        (isbnhead, bookLanguage))
                                            logger.debug("GoodReads reports language [%s] for %s" %
                                                         (bookLanguage, isbnhead))
                                        else:
                                            not_cached += 1

                                        logger.debug(u"GR language: " + bookLanguage)
                                    else:
                                        logger.debug("No %s provided for [%s]" % (find_field, book.find('title').text))
                                        # continue

                                except Exception as e:
                                    logger.debug(u"Goodreads language search failed: %s" % str(e))

                            if bookLanguage not in valid_langs:
                                logger.debug('Skipped %s with language %s' % (book.find('title').text, bookLanguage))
                                ignored += 1
                                continue

                        bookname = book.find('title').text
                        bookid = book.find('id').text
                        bookdesc = book.find('description').text
                        bookisbn = book.find('isbn').text
                        bookpub = book.find('publisher').text
                        booklink = book.find('link').text
                        bookrate = float(book.find('average_rating').text)
                        bookpages = book.find('num_pages').text
                        bookname = unaccented(bookname)

                        bookname, booksub = split_title(authorNameResult, bookname)

                        dic = {':': '.', '"': ''}  # do we need to strip apostrophes , '\'': ''}
                        bookname = replace_all(bookname, dic)
                        bookname = bookname.strip()  # strip whitespace
                        booksub = replace_all(booksub, dic)
                        booksub = booksub.strip()  # strip whitespace
                        if booksub:
                            series, seriesNum = bookSeries(booksub)
                        else:
                            series, seriesNum = bookSeries(bookname)

                        rejected = False
                        check_status = False

                        if re.match('[^\w-]', bookname):  # reject books with bad characters in title
                            logger.debug(u"removed result [" + bookname + "] for bad characters")
                            removedResults += 1
                            rejected = True

                        if not rejected and lazylibrarian.CONFIG['NO_FUTURE']:
                            if pubyear > today()[:4]:
                                logger.debug('Rejecting %s, future publication date %s' % (bookname, pubyear))
                                removedResults += 1
                                rejected = True

                        if not rejected and not bookname:
                            logger.debug('Rejecting bookid %s for %s, no bookname' %
                                         (bookid, authorNameResult))
                            removedResults += 1
                            rejected = True

                        if not rejected:
                            cmd = 'SELECT BookID FROM books,authors WHERE books.AuthorID = authors.AuthorID'
                            cmd += ' and BookName = "%s" COLLATE NOCASE and AuthorName = "%s" COLLATE NOCASE' % \
                                    (bookname, authorNameResult.replace('"', '""'))
                            match = myDB.match(cmd)
                            if match:
                                if match['BookID'] != bookid:
                                    # we have a different book with this author/title already
                                    logger.debug('Rejecting bookid %s for [%s][%s] already got %s' %
                                                 (match['BookID'], authorNameResult, bookname, bookid))
                                    duplicates += 1
                                    rejected = True

                        if not rejected:
                            cmd = 'SELECT AuthorName,BookName FROM books,authors'
                            cmd += ' WHERE authors.AuthorID = books.AuthorID AND BookID=%s' % bookid
                            match = myDB.match(cmd)
                            if match:
                                # we have a book with this bookid already
                                if bookname != match['BookName'] or authorNameResult != match['AuthorName']:
                                    logger.debug('Rejecting bookid %s for [%s][%s] already got bookid for [%s][%s]' %
                                                 (bookid, authorNameResult, bookname,
                                                 match['AuthorName'], match['BookName']))
                                else:
                                    logger.debug('Rejecting bookid %s for [%s][%s] already got this book in database' %
                                                 (bookid, authorNameResult, bookname))
                                    check_status = True
                                duplicates += 1
                                rejected = True

                        if check_status or not rejected:
                            existing_book = myDB.match('SELECT Status,Manual FROM books WHERE BookID = "%s"' % bookid)
                            if existing_book:
                                book_status = existing_book['Status']
                                locked = existing_book['Manual']
                                if locked is None:
                                    locked = False
                                elif locked.isdigit():
                                    locked = bool(int(locked))
                            else:
                                book_status = bookstatus  # new_book status, or new_author status
                                locked = False

                            # Is the book already in the database?
                            # Leave alone if locked or status "ignore"
                            if not locked and book_status != "Ignored":
                                controlValueDict = {"BookID": bookid}
                                newValueDict = {
                                    "AuthorID": authorid,
                                    "BookName": bookname,
                                    "BookSub": booksub,
                                    "BookDesc": bookdesc,
                                    "BookIsbn": bookisbn,
                                    "BookPub": bookpub,
                                    "BookGenre": "",
                                    "BookImg": bookimg,
                                    "BookLink": booklink,
                                    "BookRate": bookrate,
                                    "BookPages": bookpages,
                                    "BookDate": pubyear,
                                    "BookLang": bookLanguage,
                                    "Status": book_status,
                                    "BookAdded": today()
                                }

                                resultsCount += 1
                                updated = False

                                myDB.upsert("books", newValueDict, controlValueDict)
                                logger.debug(u"Book found: " + book.find('title').text + " " + pubyear)

                                if 'nocover' in bookimg or 'nophoto' in bookimg:
                                    # try to get a cover from librarything
                                    workcover = getBookCover(bookid)
                                    if workcover:
                                        logger.debug(u'Updated cover for %s to %s' % (bookname, workcover))
                                        controlValueDict = {"BookID": bookid}
                                        newValueDict = {"BookImg": workcover}
                                        myDB.upsert("books", newValueDict, controlValueDict)
                                        updated = True

                                elif bookimg and bookimg.startswith('http'):
                                    link, success = cache_img("book", bookid, bookimg, refresh=refresh)
                                    if success:
                                        controlValueDict = {"BookID": bookid}
                                        newValueDict = {"BookImg": link}
                                        myDB.upsert("books", newValueDict, controlValueDict)
                                        updated = True
                                    else:
                                        logger.debug('Failed to cache image for %s' % bookimg)

                                seriesdict = {}
                                if lazylibrarian.CONFIG['ADD_SERIES']:
                                    # prefer series info from librarything
                                    seriesdict = getWorkSeries(bookid)
                                    if seriesdict:
                                        logger.debug(u'Updated series: %s [%s]' % (bookid, seriesdict))
                                        updated = True
                                    else:
                                        if series:
                                            seriesdict = {cleanName(unaccented(series)): seriesNum}
                                    setSeries(seriesdict, bookid)

                                new_status = setStatus(bookid, seriesdict, bookstatus)

                                if not new_status == book_status:
                                    book_status = new_status
                                    updated = True

                                worklink = getWorkPage(bookid)
                                if worklink:
                                    controlValueDict = {"BookID": bookid}
                                    newValueDict = {"WorkPage": worklink}
                                    myDB.upsert("books", newValueDict, controlValueDict)

                                if not existing_book:
                                    logger.debug(u"[%s] Added book: %s [%s] status %s" %
                                                (authorname, bookname, bookLanguage, book_status))
                                    added_count += 1
                                elif updated:
                                    logger.debug(u"[%s] Updated book: %s [%s] status %s" %
                                                (authorname, bookname, bookLanguage, book_status))
                                    updated_count += 1
                            else:
                                book_ignore_count += 1

                    loopCount += 1
                    URL = 'http://www.goodreads.com/author/list/' + authorid + '.xml?' + \
                          urllib.urlencode(self.params) + '&page=' + str(loopCount)
                    resultxml = None
                    try:
                        rootxml, in_cache = get_xml_request(URL, useCache=not refresh)
                        if rootxml is None:
                            logger.debug('Error requesting next page of results')
                        else:
                            resultxml = rootxml.getiterator('book')
                            if not in_cache:
                                api_hits += 1
                    except Exception as e:
                        resultxml = None
                        logger.error("Error finding next page of results: %s" % str(e))

                    if resultxml:
                        if all(False for _ in resultxml):  # returns True if iterator is empty
                            resultxml = None

            deleteEmptySeries()
            lastbook = myDB.match('SELECT BookName, BookLink, BookDate, BookImg from books WHERE AuthorID="%s" \
                                AND Status != "Ignored" order by BookDate DESC' % authorid)
            if lastbook:
                lastbookname = lastbook['BookName']
                lastbooklink = lastbook['BookLink']
                lastbookdate = lastbook['BookDate']
                lastbookimg = lastbook['BookImg']
            else:
                lastbookname = ""
                lastbooklink = ""
                lastbookdate = ""
                lastbookimg = ""

            controlValueDict = {"AuthorID": authorid}
            newValueDict = {
                "Status": "Active",
                "LastBook": lastbookname,
                "LastLink": lastbooklink,
                "LastDate": lastbookdate,
                "LastBookImg": lastbookimg
            }
            myDB.upsert("authors", newValueDict, controlValueDict)

            # This is here because GoodReads sometimes has several entries with the same BookID!
            modified_count = added_count + updated_count

            logger.debug("Found %s result%s" % (total_count, plural(total_count)))
            logger.debug("Removed %s unwanted language result%s" % (ignored, plural(ignored)))
            logger.debug(
                "Removed %s bad character or no-name result%s" %
                (removedResults, plural(removedResults)))
            logger.debug("Removed %s duplicate result%s" % (duplicates, plural(duplicates)))
            logger.debug("Found %s book%s by author marked as Ignored" % (book_ignore_count, plural(book_ignore_count)))
            logger.debug("Imported/Updated %s book%s" % (modified_count, plural(modified_count)))

            myDB.action('insert into stats values ("%s", %i, %i, %i, %i, %i, %i, %i, %i, %i)' %
                        (authorname.replace('"', '""'), api_hits, gr_lang_hits, lt_lang_hits, gb_lang_change,
                         cache_hits, ignored, removedResults, not_cached, duplicates))

            if refresh:
                logger.info("[%s] Book processing complete: Added %s book%s / Updated %s book%s" %
                            (authorname, added_count, plural(added_count), updated_count, plural(updated_count)))
            else:
                logger.info("[%s] Book processing complete: Added %s book%s to the database" %
                            (authorname, added_count, plural(added_count)))

        except Exception:
            logger.error('Unhandled exception in GR.get_author_books: %s' % traceback.format_exc())
Exemplo n.º 15
0
    def find_results(self, searchterm=None, queue=None):
        """Search the GoodReads API for books and put the matches on a queue.

        Every page of ``search.xml`` results is walked (up to
        ``lazylibrarian.CONFIG['MAX_PAGES']`` pages when that is > 0) and each
        ``work`` element is turned into a result dict scored by fuzzy matching
        against the search term.

        :param searchterm: text to search for.  A ``' <ll> '`` token, if present,
                           separates the title from the author name so each part
                           can be fuzzy-matched on its own.
        :param queue: object exposing ``put()``; it receives the list of result
                      dicts (possibly empty), including when the request fails.
        :return: None - results are delivered through ``queue`` only.
        """
        try:
            resultlist = []
            api_hits = 0
            searchtitle = ''
            searchauthorname = ''

            def xml_text(node, path, default):
                # find() returns None for a missing child and .text is None for
                # an empty one; map both cases to the caller's default.
                try:
                    value = node.find(path).text
                except (KeyError, AttributeError):
                    return default
                if value is None:
                    return default
                return value

            if ' <ll> ' in searchterm:  # special token separates title from author
                # maxsplit=1 so a stray second token cannot break the unpacking
                searchtitle, searchauthorname = searchterm.split(' <ll> ', 1)
                searchterm = searchterm.replace(' <ll> ', ' ')

            searchterm = searchterm.encode(lazylibrarian.SYS_ENCODING)
            url = urllib.quote_plus(searchterm)
            set_url = 'https://www.goodreads.com/search.xml?q=' + url + '&' + urllib.urlencode(self.params)
            logger.debug('Now searching GoodReads API with searchterm: %s' % searchterm)
            # logger.debug('Searching for %s at: %s' % (searchterm, set_url))

            resultcount = 0
            try:
                try:
                    rootxml, in_cache = get_xml_request(set_url)
                except Exception as e:
                    logger.error("%s finding gr results: %s" % (type(e).__name__, str(e)))
                    # still hand back the (empty) list so a consumer of the queue gets an answer
                    queue.put(resultlist)
                    return
                if rootxml is None:
                    logger.debug("Error requesting results")
                    queue.put(resultlist)
                    return
                if not in_cache:
                    # count the first page too, as get_author_books does
                    api_hits += 1

                totalresults = check_int(rootxml.find('search/total-results').text, 0)

                # The title/author scores depend on each result, but the isbn
                # score only depends on the search term, so work it out once.
                if is_valid_isbn(searchterm):
                    isbn_fuzz = 100
                else:
                    isbn_fuzz = 0
                author_target = searchauthorname if searchauthorname else searchterm
                title_target = searchtitle if searchtitle else searchterm
                title_words = len(getList(title_target))

                # Materialise the matches: if getiterator() hands back a lazy
                # iterator it is always truthy, and probing it for emptiness
                # would swallow its first element.
                resultxml = list(rootxml.getiterator('work'))
                loopCount = 1
                while resultxml:
                    for work in resultxml:
                        bookdate = xml_text(work, 'original_publication_year', "0000")

                        # Goodreads sometimes puts extra whitespace in the author names!
                        authorNameResult = ' '.join(xml_text(work, './best_book/author/name', "").split())

                        booksub = ""
                        bookpub = ""
                        booklang = "Unknown"

                        bookimg = xml_text(work, './best_book/image_url', 'images/nocover.png')
                        if bookimg == 'https://www.goodreads.com/assets/nocover/111x148.png':
                            bookimg = 'images/nocover.png'

                        # a missing or non-numeric rating must not abort the whole search
                        try:
                            bookrate = float(xml_text(work, 'average_rating', 0))
                        except (TypeError, ValueError):
                            bookrate = 0.0

                        bookpages = '0'
                        bookgenre = ''
                        bookdesc = ''
                        bookisbn = ''

                        bookid = xml_text(work, './best_book/id', "")
                        if bookid:
                            booklink = 'https://www.goodreads.com/book/show/' + bookid
                        else:
                            booklink = ""

                        authorid = xml_text(work, './best_book/author/id', "")
                        bookTitle = xml_text(work, './best_book/title', "")

                        author_fuzz = fuzz.ratio(authorNameResult, author_target)
                        book_fuzz = fuzz.token_set_ratio(bookTitle, title_target)
                        # lose a point for each extra word in the fuzzy matches so we get the closest match
                        book_fuzz -= abs(len(getList(bookTitle)) - title_words)

                        highest_fuzz = max((author_fuzz + book_fuzz) / 2, isbn_fuzz)

                        resultlist.append({
                            'authorname': authorNameResult,
                            'bookid': bookid,
                            'authorid': authorid,
                            'bookname': bookTitle,
                            'booksub': booksub,
                            'bookisbn': bookisbn,
                            'bookpub': bookpub,
                            'bookdate': bookdate,
                            'booklang': booklang,
                            'booklink': booklink,
                            'bookrate': bookrate,
                            'bookimg': bookimg,
                            'bookpages': bookpages,
                            'bookgenre': bookgenre,
                            'bookdesc': bookdesc,
                            'author_fuzz': author_fuzz,
                            'book_fuzz': book_fuzz,
                            'isbn_fuzz': isbn_fuzz,
                            'highest_fuzz': highest_fuzz,
                            # NOTE(review): num_reviews carries the average rating, not a
                            # review count - kept as-is, confirm what consumers expect
                            'num_reviews': bookrate
                        })

                        resultcount += 1

                    loopCount += 1

                    if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < loopCount:
                        resultxml = None
                        logger.warn('Maximum results page search reached, still more results available')
                    elif totalresults and resultcount >= totalresults:
                        # fix for goodreads bug on isbn searches
                        resultxml = None
                    else:
                        URL = set_url + '&page=' + str(loopCount)
                        resultxml = None
                        try:
                            rootxml, in_cache = get_xml_request(URL)
                            if rootxml is None:
                                logger.debug('Error requesting page %s of results' % loopCount)
                            else:
                                # an empty list ends the while loop
                                resultxml = list(rootxml.getiterator('work'))
                                if not in_cache:
                                    api_hits += 1
                        except Exception as e:
                            resultxml = None
                            logger.error("%s finding page %s of results: %s" % (type(e).__name__, loopCount, str(e)))

            except Exception as err:
                # Only HTTP-style errors carry a status code; anything else
                # (parse errors, missing elements...) has no .code attribute.
                code = getattr(err, 'code', None)
                if code == 404:
                    logger.error('Received a 404 error when searching for author')
                elif code == 403:
                    logger.warn('Access to api is denied: usage exceeded')
                else:
                    logger.error('An unexpected error has occurred when searching for an author: %s' % str(err))

            logger.debug('Found %s result%s with keyword: %s' % (resultcount, plural(resultcount), searchterm))
            logger.debug(
                'The GoodReads API was hit %s time%s for keyword %s' % (api_hits, plural(api_hits), searchterm))

            queue.put(resultlist)

        except Exception:
            logger.error('Unhandled exception in GR.find_results: %s' % traceback.format_exc())
Exemplo n.º 16
0
    def get_author_books(self, authorid=None, authorname=None, refresh=False):

        api_hits = 0
        gr_lang_hits = 0
        lt_lang_hits = 0
        gb_lang_change = 0
        cache_hits = 0
        not_cached = 0
        URL = 'http://www.goodreads.com/author/list/' + authorid + '.xml?' + urllib.urlencode(
            self.params)

        # Artist is loading
        myDB = database.DBConnection()
        controlValueDict = {"AuthorID": authorid}
        newValueDict = {"Status": "Loading"}
        myDB.upsert("authors", newValueDict, controlValueDict)
        books_dict = []
        try:
            rootxml, in_cache = get_xml_request(URL, useCache=not refresh)
        except Exception as e:
            logger.error("Error fetching author books: %s" % e)
            return books_dict
        if rootxml is None:
            logger.debug("Error requesting author books")
            return books_dict
        if not in_cache:
            api_hits = api_hits + 1
        resultxml = rootxml.getiterator('book')

        valid_langs = ([
            valid_lang.strip()
            for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')
        ])

        if not len(resultxml):
            logger.warn('[%s] No books found for author with ID: %s' %
                        (authorname, authorid))
        else:
            logger.debug("[%s] Now processing books with GoodReads API" %
                         authorname)

            resultsCount = 0
            removedResults = 0
            duplicates = 0
            ignored = 0
            added_count = 0
            updated_count = 0
            book_ignore_count = 0
            total_count = 0
            logger.debug(u"url " + URL)

            authorNameResult = rootxml.find('./author/name').text
            logger.debug(u"author name " + authorNameResult)
            loopCount = 1

            while resultxml is not None:
                for book in resultxml:
                    total_count = total_count + 1

                    if (book.find('publication_year').text is None):
                        pubyear = "0000"
                    else:
                        pubyear = book.find('publication_year').text

                    try:
                        bookimg = book.find('image_url').text
                        if ('nocover' in bookimg):
                            bookimg = 'images/nocover.png'
                    except (KeyError, AttributeError):
                        bookimg = 'images/nocover.png'

    # PAB this next section tries to get the book language using the isbn13 to look it up. If no isbn13 we skip the
    # book entirely, rather than including it with an "Unknown" language. Changed this so we can still include the book
    # with language set to "Unknown". There is a setting in config.ini to allow or skip books with "Unknown" language
    # if you really don't want to include them.
    # Not all GR books have isbn13 filled in, but all have a GR bookid, which we've already got, so use that.
    # Also, with GR API rules we can only call the API once per second, which slows us down a lot when all we want
    # is to get the language. We sleep for one second per book that GR knows about for each author you have in your
    # library. The libraryThing API has the same 1 second restriction, and is limited to 1000 hits per day, but has
    # fewer books with unknown language. To get around this and speed up the process, see if we already have a book
    # in the database with a similar start to the ISBN. The way ISBNs work, digits 3-5 of a 13 char ISBN or digits 0-2
    # of a 10 digit ISBN indicate the region/language so if two books have the same 3 digit isbn code, they _should_
    # be the same language.
    # I ran a simple python script on my library of 1500 books, and these codes were 100% correct on matching book
    # languages, no mis-matches. It did result in a small number of books with "unknown" language being wrongly matched
    # but most "unknown" were matched to the correct language.
    # We could look up ISBNs we already know about in the database, but this only holds books in the languages we want
    # to keep, which reduces the number of cache hits, so we create a new database table, holding ALL results including
    # the ISBNs for languages we don't want and books we reject.
    # The new table is created (if not exists) in init.py so by the time we get here there is an existing table.
    # If we haven't an already matching partial ISBN, look up language code from libraryThing
    # "http://www.librarything.com/api/thingLang.php?isbn=1234567890"
    # If you find a matching language, add it to the database.  If "unknown" or "invalid", try GR as maybe GR can
    # provide a match.
    # If both LT and GR return unknown, add isbn to db as "unknown". No point in repeatedly asking LT for a code
    # it's told you it doesn't know.
    # As an extra option, if language includes "All" in config.ini, we can skip this whole section and process
    # everything much faster by not querying for language at all.
    # It does mean we include a lot of unwanted foreign translations in the database, but it's _much_ faster.

                    bookLanguage = "Unknown"
                    find_field = "id"
                    isbn = ""
                    isbnhead = ""
                    if "All" not in valid_langs:  # do we care about language
                        if (book.find('isbn').text is not None):
                            find_field = "isbn"
                            isbn = book.find('isbn').text
                            isbnhead = isbn[0:3]
                        else:
                            if (book.find('isbn13').text is not None):
                                find_field = "isbn13"
                                isbn = book.find('isbn13').text
                                isbnhead = isbn[3:6]
                        if (find_field != 'id'):  # isbn or isbn13 found

                            match = myDB.action(
                                'SELECT lang FROM languages where isbn = "%s"'
                                % (isbnhead)).fetchone()
                            if (match):
                                bookLanguage = match['lang']
                                cache_hits = cache_hits + 1
                                logger.debug(
                                    "Found cached language [%s] for %s [%s]" %
                                    (bookLanguage, find_field, isbnhead))
                            else:
                                # no match in cache, try searching librarything for a language code using the isbn
                                # if no language found, librarything return value is "invalid" or "unknown"
                                # returns plain text, not xml
                                BOOK_URL = 'http://www.librarything.com/api/thingLang.php?isbn=' + isbn
                                try:
                                    librarything_wait()
                                    resp = urllib2.urlopen(BOOK_URL,
                                                           timeout=30).read()
                                    lt_lang_hits = lt_lang_hits + 1
                                    logger.debug(
                                        "LibraryThing reports language [%s] for %s"
                                        % (resp, isbnhead))

                                    if ('invalid' in resp
                                            or 'Unknown' in resp):
                                        find_field = "id"  # reset the field to force search on goodreads
                                    else:
                                        bookLanguage = resp  # found a language code
                                        myDB.action(
                                            'insert into languages values ("%s", "%s")'
                                            % (isbnhead, bookLanguage))
                                        logger.debug(u"LT language %s: %s" %
                                                     (isbnhead, bookLanguage))
                                except Exception as e:
                                    logger.error(
                                        "Error finding LT language result for [%s], %s"
                                        % (isbn, e))
                                    find_field = "id"  # reset the field to search on goodreads

                        if (find_field == 'id'):
                            # [or bookLanguage == "Unknown"] no earlier match, we'll have to search the goodreads api
                            try:
                                if (book.find(find_field).text is not None):
                                    BOOK_URL = 'http://www.goodreads.com/book/show?id=' + \
                                        book.find(find_field).text + '&' + urllib.urlencode(self.params)
                                    logger.debug(u"Book URL: " + BOOK_URL)

                                    try:
                                        time_now = int(time.time())
                                        if time_now <= lazylibrarian.LAST_GOODREADS:
                                            time.sleep(1)

                                        BOOK_rootxml, in_cache = get_xml_request(
                                            BOOK_URL)
                                        if BOOK_rootxml is None:
                                            logger.debug(
                                                'Error requesting book language code'
                                            )
                                            bookLanguage = ""
                                        else:
                                            if not in_cache:
                                                # only update last_goodreads if the result wasn't found in the cache
                                                lazylibrarian.LAST_GOODREADS = time_now
                                            bookLanguage = BOOK_rootxml.find(
                                                './book/language_code').text
                                    except Exception as e:
                                        logger.error(
                                            "Error finding book results: %s" %
                                            e)
                                    if not in_cache:
                                        gr_lang_hits = gr_lang_hits + 1
                                    if not bookLanguage:
                                        bookLanguage = "Unknown"

                                    if (isbnhead != ""):
                                        # GR didn't give an isbn so we can't cache it, just use language for this book
                                        myDB.action(
                                            'insert into languages values ("%s", "%s")'
                                            % (isbnhead, bookLanguage))
                                        logger.debug(
                                            "GoodReads reports language [%s] for %s"
                                            % (bookLanguage, isbnhead))
                                    else:
                                        not_cached = not_cached + 1

                                    logger.debug(u"GR language: " +
                                                 bookLanguage)
                                else:
                                    logger.debug(
                                        "No %s provided for [%s]" %
                                        (find_field, book.find('title').text))
                                    # continue

                            except Exception as e:
                                logger.debug(u"An error has occured: %s" % e)

                        if bookLanguage not in valid_langs:
                            logger.debug('Skipped a book with language %s' %
                                         bookLanguage)
                            ignored = ignored + 1
                            continue
                    bookname = book.find('title').text
                    bookid = book.find('id').text
                    bookdesc = book.find('description').text
                    bookisbn = book.find('isbn').text
                    bookpub = book.find('publisher').text
                    booklink = book.find('link').text
                    bookrate = float(book.find('average_rating').text)
                    bookpages = book.find('num_pages').text
                    bookname = unaccented(bookname)
                    if ': ' in bookname:
                        parts = bookname.split(': ', 1)
                        bookname = parts[0]
                        booksub = parts[1]
                    else:
                        booksub = ''
                    dic = {':': '', '"': '', '\'': ''}
                    bookname = replace_all(bookname, dic)
                    bookname = bookname.strip()  # strip whitespace
                    booksub = replace_all(booksub, dic)
                    booksub = booksub.strip()  # strip whitespace
                    if booksub:
                        series, seriesNum = bookSeries(booksub)
                    else:
                        series, seriesNum = bookSeries(bookname)

                    # GoodReads sometimes has multiple bookids for the same book (same author/title, different editions)
                    # and sometimes uses the same bookid if the book is the same but the title is slightly different
                    # We use bookid, then reject if another author/title has a different bookid so we just keep one...
                    find_book_status = myDB.select(
                        'SELECT * FROM books WHERE BookID = "%s"' % bookid)
                    if find_book_status:
                        for resulted in find_book_status:
                            book_status = resulted['Status']
                            locked = resulted['Manual']
                    else:
                        book_status = lazylibrarian.NEWBOOK_STATUS
                        locked = False

                    rejected = False

                    if re.match('[^\w-]', bookname
                                ):  # reject books with bad characters in title
                        logger.debug(u"removed result [" + bookname +
                                     "] for bad characters")
                        removedResults = removedResults + 1
                        rejected = True

                    if not rejected and not bookname:
                        logger.debug(
                            'Rejecting bookid %s for %s, no bookname' %
                            (bookid, authorNameResult))
                        removedResults = removedResults + 1
                        rejected = True

                    if not rejected:
                        find_books = myDB.select(
                            'SELECT * FROM books WHERE BookName = "%s" and AuthorName = "%s"'
                            % (bookname, authorNameResult))
                        if find_books:
                            for find_book in find_books:
                                if find_book['BookID'] != bookid:
                                    # we have a book with this author/title already
                                    logger.debug(
                                        'Rejecting bookid %s for [%s][%s] already got %s'
                                        % (find_book['BookID'],
                                           authorNameResult, bookname, bookid))
                                    duplicates = duplicates + 1
                                    rejected = True
                                    break

                    if not rejected:
                        find_books = myDB.select(
                            'SELECT * FROM books WHERE BookID = "%s"' % bookid)
                        if find_books:
                            # we have a book with this bookid already
                            logger.debug(
                                'Rejecting bookid %s for [%s][%s] already got this bookid in database'
                                % (bookid, authorNameResult, bookname))
                            duplicates = duplicates + 1
                            rejected = True
                            break

                    if not rejected:
                        if book_status != "Ignored":
                            if not locked:
                                controlValueDict = {"BookID": bookid}
                                newValueDict = {
                                    "AuthorName": authorNameResult,
                                    "AuthorID": authorid,
                                    "AuthorLink": None,
                                    "BookName": bookname,
                                    "BookSub": booksub,
                                    "BookDesc": bookdesc,
                                    "BookIsbn": bookisbn,
                                    "BookPub": bookpub,
                                    "BookGenre": None,
                                    "BookImg": bookimg,
                                    "BookLink": booklink,
                                    "BookRate": bookrate,
                                    "BookPages": bookpages,
                                    "BookDate": pubyear,
                                    "BookLang": bookLanguage,
                                    "Status": book_status,
                                    "BookAdded": today(),
                                    "Series": series,
                                    "SeriesNum": seriesNum
                                }

                                resultsCount = resultsCount + 1

                                myDB.upsert("books", newValueDict,
                                            controlValueDict)
                                logger.debug(u"Book found: " +
                                             book.find('title').text + " " +
                                             pubyear)

                            if 'nocover' in bookimg or 'nophoto' in bookimg:
                                # try to get a cover from librarything
                                workcover = getBookCover(bookid)
                                if workcover:
                                    logger.debug(
                                        u'Updated cover for %s to %s' %
                                        (bookname, workcover))
                                    controlValueDict = {"BookID": bookid}
                                    newValueDict = {"BookImg": workcover}
                                    myDB.upsert("books", newValueDict,
                                                controlValueDict)

                            elif bookimg and bookimg.startswith('http'):
                                link = cache_cover(bookid, bookimg)
                                if link is not None:
                                    controlValueDict = {"BookID": bookid}
                                    newValueDict = {"BookImg": link}
                                    myDB.upsert("books", newValueDict,
                                                controlValueDict)

                            if seriesNum == None:
                                # try to get series info from librarything
                                series, seriesNum = getWorkSeries(bookid)
                                if seriesNum:
                                    logger.debug(u'Updated series: %s [%s]' %
                                                 (series, seriesNum))
                                    controlValueDict = {"BookID": bookid}
                                    newValueDict = {
                                        "Series": series,
                                        "SeriesNum": seriesNum
                                    }
                                    myDB.upsert("books", newValueDict,
                                                controlValueDict)

                            worklink = getWorkPage(bookid)
                            if worklink:
                                controlValueDict = {"BookID": bookid}
                                newValueDict = {"WorkPage": worklink}
                                myDB.upsert("books", newValueDict,
                                            controlValueDict)

                            if not find_book_status:
                                logger.debug(u"[%s] Added book: %s" %
                                             (authorname, bookname))
                                added_count = added_count + 1
                            else:
                                logger.debug(u"[%s] Updated book: %s" %
                                             (authorname, bookname))
                                updated_count = updated_count + 1
                        else:
                            book_ignore_count = book_ignore_count + 1

                loopCount = loopCount + 1
                URL = 'http://www.goodreads.com/author/list/' + authorid + '.xml?' + \
                      urllib.urlencode(self.params) + '&page=' + str(loopCount)
                resultxml = None
                try:
                    rootxml, in_cache = get_xml_request(URL,
                                                        useCache=not refresh)
                    if rootxml is None:
                        logger.debug('Error requesting next page of results')
                    else:
                        resultxml = rootxml.getiterator('book')
                        if not in_cache:
                            api_hits = api_hits + 1
                except Exception as e:
                    resultxml = None
                    logger.error("Error finding next page of results: %s" % e)

                if resultxml is not None:
                    if all(False for book in
                           resultxml):  # returns True if iterator is empty
                        resultxml = None

        lastbook = myDB.action(
            'SELECT BookName, BookLink, BookDate from books WHERE AuthorID="%s" \
                                AND Status != "Ignored" order by BookDate DESC'
            % authorid).fetchone()
        if lastbook:
            lastbookname = lastbook['BookName']
            lastbooklink = lastbook['BookLink']
            lastbookdate = lastbook['BookDate']
        else:
            lastbookname = None
            lastbooklink = None
            lastbookdate = None

        controlValueDict = {"AuthorID": authorid}
        newValueDict = {
            "Status": "Active",
            "LastBook": lastbookname,
            "LastLink": lastbooklink,
            "LastDate": lastbookdate
        }
        myDB.upsert("authors", newValueDict, controlValueDict)

        # This is here because GoodReads sometimes has several entries with the same BookID!
        modified_count = added_count + updated_count

        logger.debug("Found %s total book%s for author" %
                     (total_count, plural(total_count)))
        logger.debug("Removed %s bad language result%s for author" %
                     (ignored, plural(ignored)))
        logger.debug(
            "Removed %s bad character or no-name result%s for author" %
            (removedResults, plural(removedResults)))
        logger.debug("Removed %s duplicate result%s for author" %
                     (duplicates, plural(duplicates)))
        logger.debug("Ignored %s book%s by author marked as Ignored" %
                     (book_ignore_count, plural(book_ignore_count)))
        logger.debug("Imported/Updated %s book%s for author" %
                     (modified_count, plural(modified_count)))

        myDB.action(
            'insert into stats values ("%s", %i, %i, %i, %i, %i, %i, %i, %i, %i)'
            %
            (authorname, api_hits, gr_lang_hits, lt_lang_hits, gb_lang_change,
             cache_hits, ignored, removedResults, not_cached, duplicates))

        if refresh:
            logger.info(
                "[%s] Book processing complete: Added %s book%s / Updated %s book%s"
                % (authorname, added_count, plural(added_count), updated_count,
                   plural(updated_count)))
        else:
            logger.info(
                "[%s] Book processing complete: Added %s book%s to the database"
                % (authorname, added_count, plural(added_count)))

        return books_dict
Exemplo n.º 17
0
def getSeriesAuthors(seriesid):
    """ Get a list of authors contributing to a series
        and import those authors (and their books) into the database
        Return how many authors you added

        For every member returned by getSeriesMembers() GoodReads is searched
        first for "title author" and, if that gives no author id, for the
        title alone. Any author id found is handed to addAuthorToDB(). The
        return value is the difference in the authors table row count
        before and after the import. """
    myDB = database.DBConnection()
    result = myDB.match("select count('AuthorID') as counter from authors")
    start = int(result['counter'])
    result = myDB.match('select SeriesName from series where SeriesID=?',
                        (seriesid, ))
    seriesname = result['SeriesName']
    members = getSeriesMembers(seriesid)
    if members:
        for member in members:
            # member[0] (the order within the series) is not needed here
            bookname = member[1]
            authorname = member[2]

            authorid = ''
            try:
                authorid = _series_member_authorid(
                    bookname + ' ' + authorname, bookname, 'Author')
                if not authorid:  # try again with title only
                    authorid = _series_member_authorid(
                        bookname, bookname, 'Title')
                if not authorid:
                    logger.warn("GoodReads doesn't know about %s %s" %
                                (authorname, bookname))
            except Exception as e:
                # best effort: a failed lookup for one member must not stop
                # the remaining members from being processed
                logger.error("Error finding goodreads results: %s %s" %
                             (type(e).__name__, str(e)))

            if authorid:
                lazylibrarian.importer.addAuthorToDB(refresh=False,
                                                     authorid=authorid)

    result = myDB.match("select count('AuthorID') as counter from authors")
    finish = int(result['counter'])
    newauth = finish - start
    logger.info("Added %s new author%s for %s" %
                (newauth, plural(newauth), seriesname))
    return newauth


def _series_member_text(item, path):
    """ Return the text of the sub-element at path, or "" if it is missing """
    try:
        return item.find(path).text
    except (KeyError, AttributeError):
        return ""


def _series_member_authorid(searchname, bookname, label):
    """ Search GoodReads for searchname and return the author id of the first
        'work' whose best_book title fuzzy-matches bookname (ratio >= 98).
        label ("Author" or "Title") only prefixes the debug log message.
        Return '' if the request failed or nothing matched. """
    base_url = 'https://www.goodreads.com/search.xml?q='
    params = {"key": lazylibrarian.CONFIG['GR_API']}
    searchname = cleanName(unaccented(searchname))
    searchname = searchname.encode(lazylibrarian.SYS_ENCODING)
    searchterm = urllib.quote_plus(searchname)
    set_url = base_url + searchterm + '&' + urllib.urlencode(params)

    rootxml, in_cache = get_xml_request(set_url)
    if rootxml is None:
        logger.warn('Error getting XML for %s' % searchname)
        return ''

    for item in rootxml.getiterator('work'):
        # a result without a title is skipped rather than aborting the search
        booktitle = _series_member_text(item, './best_book/title')
        if fuzz.token_set_ratio(booktitle, bookname) < 98:
            continue
        author = _series_member_text(item, './best_book/author/name')
        authorid = _series_member_text(item, './best_book/author/id')
        logger.debug("%s Search found %s %s, authorid %s" %
                     (label, author, booktitle, authorid))
        # only the first matching work is considered, even if it has no id
        return authorid
    return ''
Exemplo n.º 18
0
    def find_book(self, bookid=None, queue=None):
        """Fetch a single book from GoodReads and store it as "Wanted".

        Requests the book/show page for bookid, extracts the book details,
        looks up the author id through GoodReads(authorname).find_author_id()
        and upserts the result into the books table. Afterwards it tries to
        improve the stored cover, series information and work page link.

        bookid: GoodReads book id as a string (it is concatenated into the URL).
        queue:  accepted for interface compatibility; not used in this method.

        Returns None in all cases. Nothing is written to the database when
        the request fails or when no author id can be found for the book.
        """
        myDB = database.DBConnection()

        URL = 'https://www.goodreads.com/book/show/' + bookid + '?' + urllib.urlencode(
            self.params)

        try:
            rootxml, in_cache = get_xml_request(URL)
        except Exception as e:
            logger.error("Error finding book: %s" % e)
            return
        if rootxml is None:
            logger.debug("Error requesting book")
            return

        bookLanguage = rootxml.find('./book/language_code').text
        bookname = rootxml.find('./book/title').text

        if not bookLanguage:
            bookLanguage = "Unknown"

        # PAB user has said they want this book, don't block for bad language, just warn
        valid_langs = [
            valid_lang.strip()
            for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')
        ]
        if bookLanguage not in valid_langs:
            logger.debug('Book %s language does not match preference' %
                         bookname)

        bookdate = rootxml.find('./book/publication_year').text
        if bookdate is None:
            bookdate = "0000"

        # NOTE(review): the author-list parser in this file reads 'image_url';
        # confirm 'img_url' is the right tag for book/show responses. If the
        # tag is absent the AttributeError below falls back to the nocover image.
        try:
            bookimg = rootxml.find('./book/img_url').text
            if 'assets/nocover' in bookimg:
                bookimg = 'images/nocover.png'
        except (KeyError, AttributeError):
            bookimg = 'images/nocover.png'

        authorname = rootxml.find('./book/authors/author/name').text
        bookdesc = rootxml.find('./book/description').text
        bookisbn = rootxml.find('./book/isbn').text
        bookpub = rootxml.find('./book/publisher').text
        booklink = rootxml.find('./book/link').text
        bookrate = float(rootxml.find('./book/average_rating').text)
        # path written as './book/...' to match every other lookup above
        bookpages = rootxml.find('./book/num_pages').text

        GR = GoodReads(authorname)
        author = GR.find_author_id()
        if not author:
            # without an author id the row cannot be built; previously this
            # path fell through and raised UnboundLocalError on AuthorID
            logger.warn("No AuthorID for %s, unable to add book %s" %
                        (authorname, bookname))
            return
        AuthorID = author['authorid']

        # split "Title: Subtitle" into name and sub, then strip punctuation
        booksub = ''
        bookname = unaccented(bookname)
        if ': ' in bookname:
            parts = bookname.split(': ', 1)
            bookname = parts[0]
            booksub = parts[1]

        dic = {':': '', '"': '', '\'': ''}
        bookname = replace_all(bookname, dic)
        bookname = bookname.strip()  # strip whitespace
        booksub = replace_all(booksub, dic)
        booksub = booksub.strip()  # strip whitespace
        # series info is parsed from the subtitle when there is one
        if booksub:
            series, seriesNum = bookSeries(booksub)
        else:
            series, seriesNum = bookSeries(bookname)

        controlValueDict = {"BookID": bookid}
        newValueDict = {
            "AuthorName": authorname,
            "AuthorID": AuthorID,
            "AuthorLink": None,
            "BookName": bookname,
            "BookSub": booksub,
            "BookDesc": bookdesc,
            "BookIsbn": bookisbn,
            "BookPub": bookpub,
            "BookGenre": None,
            "BookImg": bookimg,
            "BookLink": booklink,
            "BookRate": bookrate,
            "BookPages": bookpages,
            "BookDate": bookdate,
            "BookLang": bookLanguage,
            "Status": "Wanted",
            "BookAdded": today(),
            "Series": series,
            "SeriesNum": seriesNum
        }

        myDB.upsert("books", newValueDict, controlValueDict)
        logger.debug("%s added to the books database" % bookname)

        if 'nocover' in bookimg or 'nophoto' in bookimg:
            # try to get a cover from librarything
            workcover = getBookCover(bookid)
            if workcover:
                logger.debug(u'Updated cover for %s to %s' %
                             (bookname, workcover))
                controlValueDict = {"BookID": bookid}
                newValueDict = {"BookImg": workcover}
                myDB.upsert("books", newValueDict, controlValueDict)

        elif bookimg and bookimg.startswith('http'):
            link = cache_cover(bookid, bookimg)
            if link is not None:
                controlValueDict = {"BookID": bookid}
                newValueDict = {"BookImg": link}
                myDB.upsert("books", newValueDict, controlValueDict)

        if seriesNum is None:
            #  try to get series info from librarything
            series, seriesNum = getWorkSeries(bookid)
            if seriesNum:
                logger.debug(u'Updated series: %s [%s]' % (series, seriesNum))
                controlValueDict = {"BookID": bookid}
                newValueDict = {"Series": series, "SeriesNum": seriesNum}
                myDB.upsert("books", newValueDict, controlValueDict)

        worklink = getWorkPage(bookid)
        if worklink:
            controlValueDict = {"BookID": bookid}
            newValueDict = {"WorkPage": worklink}
            myDB.upsert("books", newValueDict, controlValueDict)
Exemplo n.º 19
0
    def get_author_books(self, authorid=None, authorname=None, refresh=False):
      """Import the books GoodReads lists for an author into the database.

      Walks the paged GoodReads ``author/list`` XML for ``authorid``. Unless
      "All" is one of the entries in ``lazylibrarian.IMP_PREFLANG``, each
      book's language is looked up (ISBN group prefix, then the ``languages``
      table, then LibraryThing, then the GoodReads book page) and books whose
      language is not in that list are skipped. Remaining books are rejected
      when the cleaned title is empty, when its first character is neither a
      word character nor "-", or when the book duplicates one already stored;
      otherwise they are upserted into the ``books`` table. Finally the
      author's row is set to "Active" together with the LastBook fields (taken
      from a books query ordered by BookDate DESC) and one row of counters is
      inserted into ``stats``.

      authorid   -- GoodReads author id; concatenated into URLs and SQL
      authorname -- author name used in log messages and as the stats key
      refresh    -- forwarded as ``useCache=not refresh`` when fetching the
                    author's book list pages

      Returns None. Exceptions not handled further down are caught here and
      logged with their traceback instead of being raised to the caller.
      """
      # The body below is indented relative to this wrapping try, which sits
      # two columns in from the def; keep that layout when editing.
      try:
        # Request/cache counters, written to the "stats" table at the end.
        api_hits = 0
        gr_lang_hits = 0
        lt_lang_hits = 0
        gb_lang_change = 0
        cache_hits = 0
        not_cached = 0

        # Result counters. They are initialised here (not only when books are
        # found) because the summary and stats code at the end of the method
        # reads them even when the author has no books.
        resultsCount = 0
        removedResults = 0
        duplicates = 0
        ignored = 0
        added_count = 0
        updated_count = 0
        book_ignore_count = 0
        total_count = 0

        URL = 'http://www.goodreads.com/author/list/' + authorid + '.xml?' + urllib.urlencode(self.params)

        # Mark the author as loading while the import runs
        myDB = database.DBConnection()
        controlValueDict = {"AuthorID": authorid}
        newValueDict = {"Status": "Loading"}
        myDB.upsert("authors", newValueDict, controlValueDict)

        # NOTE(review): both early returns below leave the author's Status as
        # "Loading"; confirm whether a caller resets it.
        try:
            rootxml, in_cache = get_xml_request(URL, useCache=not refresh)
        except Exception as e:
            logger.error("Error fetching author books: %s" % str(e))
            return
        if rootxml is None:
            logger.debug("Error requesting author books")
            return
        if not in_cache:
            api_hits += 1
        resultxml = rootxml.getiterator('book')

        valid_langs = ([valid_lang.strip() for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')])

        if not len(resultxml):
            logger.warn('[%s] No books found for author with ID: %s' % (authorname, authorid))
        else:
            logger.debug("[%s] Now processing books with GoodReads API" % authorname)
            logger.debug(u"url " + URL)

            authorNameResult = rootxml.find('./author/name').text
            logger.debug(u"author name " + authorNameResult)
            loopCount = 1

            # ISBN registration-group prefixes mapped to language codes.
            # For isbn13 the prefix is compared against the digits that
            # follow the 978/979 EAN prefix; for isbn10 against its first digits.
            isbn_979_dict = {
                "10": "fre",
                "11": "kor",
                "12": "ita"
            }
            isbn_978_dict = {
                "0": "eng",
                "1": "eng",
                "2": "fre",
                "3": "ger",
                "4": "jap",
                "5": "rus"
            }

            # One pass of this loop per page of results; resultxml is set to
            # None at the bottom once a page fails to load or comes back empty.
            while resultxml:
                for book in resultxml:
                    total_count += 1

                    if (book.find('publication_year').text is None):
                        pubyear = "0000"
                    else:
                        pubyear = book.find('publication_year').text

                    try:
                        bookimg = book.find('image_url').text
                        if ('nocover' in bookimg):
                            bookimg = 'images/nocover.png'
                    except (KeyError, AttributeError):
                        bookimg = 'images/nocover.png'

                    bookLanguage = "Unknown"
                    find_field = "id"
                    isbn = ""
                    isbnhead = ""
                    if "All" not in valid_langs:  # do we care about language
                        if book.find('isbn').text:
                            find_field = "isbn"
                            isbn = book.find('isbn').text
                            isbnhead = isbn[0:3]
                        else:
                            if book.find('isbn13').text:
                                find_field = "isbn13"
                                isbn = book.find('isbn13').text
                                isbnhead = isbn[3:6]
                        if (find_field != 'id'):  # isbn10 or isbn13 found
                            # Try to use shortcut of ISBN identifier codes described here...
                            # https://en.wikipedia.org/wiki/List_of_ISBN_identifier_groups
                            if isbnhead != "":
                                if find_field == "isbn13" and isbn.startswith('979'):
                                    for item in isbn_979_dict:
                                        if isbnhead.startswith(item):
                                            bookLanguage = isbn_979_dict[item]
                                            break
                                    if bookLanguage != "Unknown":
                                        logger.debug("ISBN979 returned %s for %s" % (bookLanguage, isbnhead))
                                elif (find_field == "isbn") or (find_field == "isbn13" and isbn.startswith('978')):
                                    for item in isbn_978_dict:
                                        if isbnhead.startswith(item):
                                            bookLanguage = isbn_978_dict[item]
                                            break
                                    if bookLanguage != "Unknown":
                                        logger.debug("ISBN978 returned %s for %s" % (bookLanguage, isbnhead))

                        # Only consult the language cache / LibraryThing when we
                        # actually have an isbn prefix: without one the cache key
                        # and the thingLang query would both be empty strings.
                        if bookLanguage == "Unknown" and isbnhead:
                            # Nothing in the isbn dictionary, try any cached results
                            match = myDB.match('SELECT lang FROM languages where isbn = "%s"' % (isbnhead))
                            if match:
                                bookLanguage = match['lang']
                                cache_hits += 1
                                logger.debug("Found cached language [%s] for %s [%s]" %
                                             (bookLanguage, find_field, isbnhead))
                            else:
                                # no match in cache, try searching librarything for a language code using the isbn
                                # if no language found, librarything return value is "invalid" or "unknown"
                                # returns plain text, not xml
                                BOOK_URL = 'http://www.librarything.com/api/thingLang.php?isbn=' + isbn
                                try:
                                    librarything_wait()
                                    resp = urllib2.urlopen(BOOK_URL, timeout=30).read()
                                    lt_lang_hits += 1
                                    logger.debug("LibraryThing reports language [%s] for %s" % (resp, isbnhead))

                                    if ('invalid' in resp or 'Unknown' in resp):
                                        bookLanguage = "Unknown"
                                    else:
                                        bookLanguage = resp  # found a language code
                                        # NOTE(review): resp is remote text placed into SQL
                                        # unescaped; consider escaping/parameterising.
                                        myDB.action('insert into languages values ("%s", "%s")' %
                                                    (isbnhead, bookLanguage))
                                        logger.debug(u"LT language %s: %s" % (isbnhead, bookLanguage))
                                except Exception as e:
                                    logger.error("Error finding LT language result for [%s], %s" % (isbn, str(e)))

                        if bookLanguage == "Unknown":
                            # still  no earlier match, we'll have to search the goodreads api
                            try:
                                if book.find(find_field).text:
                                    BOOK_URL = 'http://www.goodreads.com/book/show?id=' + \
                                        book.find(find_field).text + '&' + urllib.urlencode(self.params)
                                    logger.debug(u"Book URL: " + BOOK_URL)

                                    try:
                                        # wait a second if the last uncached GoodReads
                                        # request was made within the current second
                                        time_now = int(time.time())
                                        if time_now <= lazylibrarian.LAST_GOODREADS:
                                            time.sleep(1)

                                        BOOK_rootxml, in_cache = get_xml_request(BOOK_URL)
                                        if BOOK_rootxml is None:
                                            logger.debug('Error requesting book language code')
                                            bookLanguage = ""
                                        else:
                                            if not in_cache:
                                                # only update last_goodreads if the result wasn't found in the cache
                                                lazylibrarian.LAST_GOODREADS = time_now
                                            bookLanguage = BOOK_rootxml.find('./book/language_code').text
                                    except Exception as e:
                                        logger.error("Error finding book results: %s" % str(e))
                                    # NOTE(review): if get_xml_request raised above, in_cache
                                    # still holds the value from the author list request.
                                    if not in_cache:
                                        gr_lang_hits += 1
                                    if not bookLanguage:
                                        bookLanguage = "Unknown"
                                        # At this point, give up?
                                        # WhatWork on author/title doesn't give us a language.
                                        # It might give us the "original language" of the book (but not always)
                                        # and our copy might not be in the original language anyway
                                        # eg "The Girl With the Dragon Tattoo" original language Swedish
                                        # If we have an isbn, try WhatISBN to get alternatives
                                        # in case any of them give us a language, but it seems if thinglang doesn't
                                        # have a language for the first isbn code, it doesn't for any of the
                                        # alternatives either
                                        # Goodreads search results don't include the language. Although sometimes
                                        # it's in the html page, it's not in the xml results

                                    if (isbnhead != ""):
                                        # if GR didn't give an isbn we can't cache it, just use language for this book
                                        myDB.action('insert into languages values ("%s", "%s")' %
                                                    (isbnhead, bookLanguage))
                                        logger.debug("GoodReads reports language [%s] for %s" %
                                                     (bookLanguage, isbnhead))
                                    else:
                                        not_cached += 1

                                    logger.debug(u"GR language: " + bookLanguage)
                                else:
                                    logger.debug("No %s provided for [%s]" % (find_field, book.find('title').text))
                                    # continue

                            except Exception as e:
                                logger.debug(u"Goodreads language search failed: %s" % str(e))

                        if bookLanguage not in valid_langs:
                            logger.debug('Skipped %s with language %s' % (book.find('title').text, bookLanguage))
                            ignored += 1
                            continue

                    bookname = book.find('title').text
                    bookid = book.find('id').text
                    bookdesc = book.find('description').text
                    bookisbn = book.find('isbn').text
                    bookpub = book.find('publisher').text
                    booklink = book.find('link').text
                    bookrate = float(book.find('average_rating').text)
                    bookpages = book.find('num_pages').text
                    bookname = unaccented(bookname)

                    bookname, booksub = split_title(authorNameResult, bookname)

                    # strip colons and quotes from the title parts; the double
                    # quote removal also keeps them safe inside the quoted SQL below
                    dic = {':': '', '"': '', '\'': ''}
                    bookname = replace_all(bookname, dic)
                    bookname = bookname.strip()  # strip whitespace
                    booksub = replace_all(booksub, dic)
                    booksub = booksub.strip()  # strip whitespace
                    if booksub:
                        series, seriesNum = bookSeries(booksub)
                    else:
                        series, seriesNum = bookSeries(bookname)

                    # GoodReads sometimes has multiple bookids for the same book (same author/title, different editions)
                    # and sometimes uses the same bookid if the book is the same but the title is slightly different
                    # We use bookid, then reject if another author/title has a different bookid so we just keep one...
                    find_book_status = myDB.select('SELECT * FROM books WHERE BookID = "%s"' % bookid)
                    if find_book_status:
                        for resulted in find_book_status:
                            book_status = resulted['Status']
                            locked = resulted['Manual']
                    else:
                        book_status = lazylibrarian.NEWBOOK_STATUS
                        locked = False

                    rejected = False

                    # re.match anchors at the start, so this tests the first character only
                    if re.match(r'[^\w-]', bookname):  # reject books with bad characters in title
                        logger.debug(u"removed result [" + bookname + "] for bad characters")
                        removedResults += 1
                        rejected = True

                    if not rejected and not bookname:
                        logger.debug('Rejecting bookid %s for %s, no bookname' %
                                     (bookid, authorNameResult))
                        removedResults += 1
                        rejected = True

                    if not rejected:
                        find_books = myDB.select('SELECT * FROM books WHERE BookName = "%s" and AuthorName = "%s"' %
                                                 (bookname, authorNameResult.replace('"', '""')))
                        if find_books:
                            for find_book in find_books:
                                if find_book['BookID'] != bookid:
                                    # we have a book with this author/title already:
                                    # reject the incoming bookid, keep the stored one
                                    logger.debug('Rejecting bookid %s for [%s][%s] already got %s' %
                                                 (bookid, authorNameResult, bookname, find_book['BookID']))
                                    duplicates += 1
                                    rejected = True

                    if not rejected:
                        find_books = myDB.match('SELECT AuthorName,BookName FROM books WHERE BookID = "%s"' % bookid)
                        if find_books:
                            # we have a book with this bookid already
                            if bookname != find_books['BookName'] or authorNameResult != find_books['AuthorName']:
                                logger.debug('Rejecting bookid %s for [%s][%s] already got bookid for [%s][%s]' %
                                            (bookid, authorNameResult, bookname,
                                             find_books['AuthorName'], find_books['BookName']))
                            else:
                                logger.debug('Rejecting bookid %s for [%s][%s] already got this book in database' %
                                             (bookid, authorNameResult, bookname))
                            duplicates += 1
                            rejected = True

                    if not rejected:
                        if book_status != "Ignored":
                            if not locked:
                                controlValueDict = {"BookID": bookid}
                                newValueDict = {
                                    "AuthorName": authorNameResult,
                                    "AuthorID": authorid,
                                    "AuthorLink": None,
                                    "BookName": bookname,
                                    "BookSub": booksub,
                                    "BookDesc": bookdesc,
                                    "BookIsbn": bookisbn,
                                    "BookPub": bookpub,
                                    "BookGenre": None,
                                    "BookImg": bookimg,
                                    "BookLink": booklink,
                                    "BookRate": bookrate,
                                    "BookPages": bookpages,
                                    "BookDate": pubyear,
                                    "BookLang": bookLanguage,
                                    "Status": book_status,
                                    "BookAdded": today(),
                                    "Series": series,
                                    "SeriesNum": seriesNum
                                }

                                resultsCount += 1

                                myDB.upsert("books", newValueDict, controlValueDict)
                                logger.debug(u"Book found: " + book.find('title').text + " " + pubyear)

                            if 'nocover' in bookimg or 'nophoto' in bookimg:
                                # try to get a cover from librarything
                                workcover = getBookCover(bookid)
                                if workcover:
                                    logger.debug(u'Updated cover for %s to %s' % (bookname, workcover))
                                    controlValueDict = {"BookID": bookid}
                                    newValueDict = {"BookImg": workcover}
                                    myDB.upsert("books", newValueDict, controlValueDict)

                            elif bookimg and bookimg.startswith('http'):
                                # store the link returned by cache_cover in place of the remote url
                                link = cache_cover(bookid, bookimg)
                                if link:
                                    controlValueDict = {"BookID": bookid}
                                    newValueDict = {"BookImg": link}
                                    myDB.upsert("books", newValueDict, controlValueDict)

                            if seriesNum is None:
                                # try to get series info from librarything
                                series, seriesNum = getWorkSeries(bookid)
                                if seriesNum:
                                    logger.debug(u'Updated series: %s [%s]' % (series, seriesNum))
                                    controlValueDict = {"BookID": bookid}
                                    newValueDict = {
                                        "Series": series,
                                        "SeriesNum": seriesNum
                                    }
                                    myDB.upsert("books", newValueDict, controlValueDict)

                            worklink = getWorkPage(bookid)
                            if worklink:
                                controlValueDict = {"BookID": bookid}
                                newValueDict = {"WorkPage": worklink}
                                myDB.upsert("books", newValueDict, controlValueDict)

                            if not find_book_status:
                                logger.debug(u"[%s] Added book: %s" % (authorname, bookname))
                                added_count += 1
                            else:
                                logger.debug(u"[%s] Updated book: %s" % (authorname, bookname))
                                updated_count += 1
                        else:
                            book_ignore_count += 1

                # fetch the next page of the author's book list
                loopCount += 1
                URL = 'http://www.goodreads.com/author/list/' + authorid + '.xml?' + \
                      urllib.urlencode(self.params) + '&page=' + str(loopCount)
                resultxml = None
                try:
                    rootxml, in_cache = get_xml_request(URL, useCache=not refresh)
                    if rootxml is None:
                        logger.debug('Error requesting next page of results')
                    else:
                        resultxml = rootxml.getiterator('book')
                        if not in_cache:
                            api_hits += 1
                except Exception as e:
                    resultxml = None
                    logger.error("Error finding next page of results: %s" % str(e))

                if resultxml:
                    if all(False for book in resultxml):  # returns True if iterator is empty
                        resultxml = None

        lastbook = myDB.match('SELECT BookName, BookLink, BookDate from books WHERE AuthorID="%s" \
                                AND Status != "Ignored" order by BookDate DESC' % authorid)
        if lastbook:
            lastbookname = lastbook['BookName']
            lastbooklink = lastbook['BookLink']
            lastbookdate = lastbook['BookDate']
        else:
            lastbookname = None
            lastbooklink = None
            lastbookdate = None

        controlValueDict = {"AuthorID": authorid}
        newValueDict = {
            "Status": "Active",
            "LastBook": lastbookname,
            "LastLink": lastbooklink,
            "LastDate": lastbookdate
        }
        myDB.upsert("authors", newValueDict, controlValueDict)

        # This is here because GoodReads sometimes has several entries with the same BookID!
        modified_count = added_count + updated_count

        logger.debug("Found %s total book%s for author" % (total_count, plural(total_count)))
        logger.debug("Removed %s unwanted language result%s for author" % (ignored, plural(ignored)))
        logger.debug(
            "Removed %s bad character or no-name result%s for author" %
            (removedResults, plural(removedResults)))
        logger.debug("Removed %s duplicate result%s for author" % (duplicates, plural(duplicates)))
        logger.debug("Found %s book%s by author marked as Ignored" % (book_ignore_count, plural(book_ignore_count)))
        logger.debug("Imported/Updated %s book%s for author" % (modified_count, plural(modified_count)))

        myDB.action('insert into stats values ("%s", %i, %i, %i, %i, %i, %i, %i, %i, %i)' %
                    (authorname.replace('"', '""'), api_hits, gr_lang_hits, lt_lang_hits, gb_lang_change,
                     cache_hits, ignored, removedResults, not_cached, duplicates))

        if refresh:
            logger.info("[%s] Book processing complete: Added %s book%s / Updated %s book%s" %
                        (authorname, added_count, plural(added_count), updated_count, plural(updated_count)))
        else:
            logger.info("[%s] Book processing complete: Added %s book%s to the database" %
                        (authorname, added_count, plural(added_count)))

      except Exception as e:
        # last-resort boundary: log the full traceback, never raise to the caller
        logger.error('Unhandled exception in GR.get_author_books: %s' % traceback.format_exc())