def find_book(self, bookid=None, queue=None):
    myDB = database.DBConnection()
    URL = 'https://www.goodreads.com/book/show/' + bookid + '?' + urllib.urlencode(self.params)

    try:
        rootxml, in_cache = get_xml_request(URL)
        if rootxml is None:
            logger.debug("Error requesting book")
            return
    except Exception as e:
        logger.error("Error finding book: %s" % e)
        return

    bookLanguage = rootxml.find('./book/language_code').text
    bookname = rootxml.find('./book/title').text

    if not bookLanguage:
        bookLanguage = "Unknown"
    #
    # PAB user has said they want this book, don't block for bad language, just warn
    #
    valid_langs = [valid_lang.strip() for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')]
    if bookLanguage not in valid_langs:
        logger.debug('Book %s language does not match preference' % bookname)

    if rootxml.find('./book/publication_year').text is None:
        bookdate = "0000"
    else:
        bookdate = rootxml.find('./book/publication_year').text

    try:
        bookimg = rootxml.find('./book/img_url').text
        if 'assets/nocover' in bookimg:
            bookimg = 'images/nocover.png'
    except (KeyError, AttributeError):
        bookimg = 'images/nocover.png'

    authorname = rootxml.find('./book/authors/author/name').text
    bookdesc = rootxml.find('./book/description').text
    bookisbn = rootxml.find('./book/isbn').text
    bookpub = rootxml.find('./book/publisher').text
    booklink = rootxml.find('./book/link').text
    bookrate = float(rootxml.find('./book/average_rating').text)
    bookpages = rootxml.find('./book/num_pages').text

    name = authorname
    GR = GoodReads(name)
    author = GR.find_author_id()
    if author:
        AuthorID = author['authorid']

    booksub = ''
    bookname = unaccented(bookname)
    if ': ' in bookname:
        parts = bookname.split(': ', 1)
        bookname = parts[0]
        booksub = parts[1]

    dic = {':': '', '"': '', '\'': ''}
    bookname = replace_all(bookname, dic)
    bookname = bookname.strip()  # strip whitespace
    booksub = replace_all(booksub, dic)
    booksub = booksub.strip()  # strip whitespace

    if booksub:
        series, seriesNum = bookSeries(booksub)
    else:
        series, seriesNum = bookSeries(bookname)

    controlValueDict = {"BookID": bookid}
    newValueDict = {
        "AuthorName": authorname,
        "AuthorID": AuthorID,
        "AuthorLink": None,
        "BookName": bookname,
        "BookSub": booksub,
        "BookDesc": bookdesc,
        "BookIsbn": bookisbn,
        "BookPub": bookpub,
        "BookGenre": None,
        "BookImg": bookimg,
        "BookLink": booklink,
        "BookRate": bookrate,
        "BookPages": bookpages,
        "BookDate": bookdate,
        "BookLang": bookLanguage,
        "Status": "Wanted",
        "BookAdded": today(),
        "Series": series,
        "SeriesNum": seriesNum
    }

    myDB.upsert("books", newValueDict, controlValueDict)
    logger.debug("%s added to the books database" % bookname)

    if 'nocover' in bookimg or 'nophoto' in bookimg:
        # try to get a cover from librarything
        workcover = getBookCover(bookid)
        if workcover:
            logger.debug(u'Updated cover for %s to %s' % (bookname, workcover))
            controlValueDict = {"BookID": bookid}
            newValueDict = {"BookImg": workcover}
            myDB.upsert("books", newValueDict, controlValueDict)
    elif bookimg and bookimg.startswith('http'):
        link = cache_cover(bookid, bookimg)
        if link is not None:
            controlValueDict = {"BookID": bookid}
            newValueDict = {"BookImg": link}
            myDB.upsert("books", newValueDict, controlValueDict)

    if seriesNum is None:
        # try to get series info from librarything
        series, seriesNum = getWorkSeries(bookid)
        if seriesNum:
            logger.debug(u'Updated series: %s [%s]' % (series, seriesNum))
            controlValueDict = {"BookID": bookid}
            newValueDict = {"Series": series, "SeriesNum": seriesNum}
            myDB.upsert("books", newValueDict, controlValueDict)

    worklink = getWorkPage(bookid)
    if worklink:
        controlValueDict = {"BookID": bookid}
        newValueDict = {"WorkPage": worklink}
        myDB.upsert("books", newValueDict, controlValueDict)
def dbupgrade(db_current_version):
    try:
        myDB = database.DBConnection()
        db_version = 0
        result = myDB.match('PRAGMA user_version')
        if result and result[0]:
            value = str(result[0])
            if value.isdigit():
                db_version = int(value)

        check = myDB.match('PRAGMA integrity_check')
        if check and check[0]:
            result = check[0]
            if result == 'ok':
                logger.debug('Database integrity check: %s' % result)
            else:
                logger.error('Database integrity check: %s' % result)
                # should probably abort now

        if db_version < db_current_version:
            myDB = database.DBConnection()

            if db_version < 1:
                if not has_column(myDB, "authors", "AuthorID"):
                    # it's a new database. Create tables but no need for any upgrading
                    db_version = db_current_version
                    lazylibrarian.UPDATE_MSG = 'Creating new database, version %s' % db_version
            else:
                lazylibrarian.UPDATE_MSG = 'Updating database to version %s, current version is %s' % (
                    db_current_version, db_version)
            logger.info(lazylibrarian.UPDATE_MSG)

            myDB.action('CREATE TABLE IF NOT EXISTS authors (AuthorID TEXT UNIQUE, AuthorName TEXT UNIQUE, \
                AuthorImg TEXT, AuthorLink TEXT, DateAdded TEXT, Status TEXT, LastBook TEXT, LastBookImg TEXT, \
                LastLink Text, LastDate TEXT, HaveBooks INTEGER, TotalBooks INTEGER, AuthorBorn TEXT, \
                AuthorDeath TEXT, UnignoredBooks INTEGER, Manual TEXT)')
            myDB.action('CREATE TABLE IF NOT EXISTS books (AuthorID TEXT, \
                BookName TEXT, BookSub TEXT, BookDesc TEXT, BookGenre TEXT, BookIsbn TEXT, BookPub TEXT, \
                BookRate INTEGER, BookImg TEXT, BookPages INTEGER, BookLink TEXT, BookID TEXT UNIQUE, BookFile TEXT, \
                BookDate TEXT, BookLang TEXT, BookAdded TEXT, Status TEXT, WorkPage TEXT, Manual TEXT)')
            myDB.action('CREATE TABLE IF NOT EXISTS wanted (BookID TEXT, NZBurl TEXT, NZBtitle TEXT, NZBdate TEXT, \
                NZBprov TEXT, Status TEXT, NZBsize TEXT, AuxInfo TEXT, NZBmode TEXT, Source TEXT, DownloadID TEXT)')
            myDB.action('CREATE TABLE IF NOT EXISTS pastissues AS SELECT * FROM wanted WHERE 0')  # same columns
            myDB.action('CREATE TABLE IF NOT EXISTS magazines (Title TEXT UNIQUE, Regex TEXT, Status TEXT, \
                MagazineAdded TEXT, LastAcquired TEXT, IssueDate TEXT, IssueStatus TEXT, Reject TEXT, \
                LatestCover TEXT)')
            myDB.action('CREATE TABLE IF NOT EXISTS languages (isbn TEXT, lang TEXT)')
            myDB.action('CREATE TABLE IF NOT EXISTS issues (Title TEXT, IssueID TEXT UNIQUE, IssueAcquired TEXT, \
                IssueDate TEXT, IssueFile TEXT)')
            myDB.action('CREATE TABLE IF NOT EXISTS stats (authorname text, GR_book_hits int, GR_lang_hits int, \
                LT_lang_hits int, GB_lang_change, cache_hits int, bad_lang int, bad_char int, uncached int, \
                duplicates int)')
            myDB.action('CREATE TABLE IF NOT EXISTS series (SeriesID INTEGER PRIMARY KEY, SeriesName TEXT, \
                Status TEXT)')
            myDB.action('CREATE TABLE IF NOT EXISTS member (SeriesID INTEGER, BookID TEXT, SeriesNum TEXT)')
            myDB.action('CREATE TABLE IF NOT EXISTS seriesauthors (SeriesID INTEGER, AuthorID TEXT)')

            # These are the incremental changes before database versioning was introduced.
            # Old database tables might already have these incorporated depending on version, so we need to check...
            if db_version < 1:
                if not has_column(myDB, "books", "BookSub"):
                    lazylibrarian.UPDATE_MSG = 'Updating database to hold book subtitles.'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    myDB.action('ALTER TABLE books ADD COLUMN BookSub TEXT')

                if not has_column(myDB, "books", "BookPub"):
                    lazylibrarian.UPDATE_MSG = 'Updating database to hold book publisher'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    myDB.action('ALTER TABLE books ADD COLUMN BookPub TEXT')

                if not has_column(myDB, "books", "BookGenre"):
                    lazylibrarian.UPDATE_MSG = 'Updating database to hold bookgenre'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    myDB.action('ALTER TABLE books ADD COLUMN BookGenre TEXT')

                if not has_column(myDB, "books", "BookFile"):
                    lazylibrarian.UPDATE_MSG = 'Updating database to hold book filename'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    myDB.action('ALTER TABLE books ADD COLUMN BookFile TEXT')

                if not has_column(myDB, "wanted", "AuxInfo"):
                    lazylibrarian.UPDATE_MSG = 'Updating database to hold AuxInfo'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    myDB.action('ALTER TABLE wanted ADD COLUMN AuxInfo TEXT')

                if not has_column(myDB, "wanted", "NZBsize"):
                    lazylibrarian.UPDATE_MSG = 'Updating database to hold NZBsize'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    myDB.action('ALTER TABLE wanted ADD COLUMN NZBsize TEXT')

                if not has_column(myDB, "wanted", "NZBmode"):
                    lazylibrarian.UPDATE_MSG = 'Updating database to hold NZBmode'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    myDB.action('ALTER TABLE wanted ADD COLUMN NZBmode TEXT')

                if not has_column(myDB, "authors", "UnignoredBooks"):
                    lazylibrarian.UPDATE_MSG = 'Updating database to hold UnignoredBooks'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    myDB.action('ALTER TABLE authors ADD COLUMN UnignoredBooks INTEGER')

                if not has_column(myDB, "magazines", "IssueStatus"):
                    lazylibrarian.UPDATE_MSG = 'Updating database to hold IssueStatus'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    myDB.action('ALTER TABLE magazines ADD COLUMN IssueStatus TEXT')

                addedWorkPage = False
                if not has_column(myDB, "books", "WorkPage"):
                    lazylibrarian.UPDATE_MSG = 'Updating database to hold WorkPage'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    myDB.action('ALTER TABLE books ADD COLUMN WorkPage TEXT')
                    addedWorkPage = True

                addedSeries = False
                if not has_column(myDB, "series", "SeriesID") and not has_column(myDB, "books", "Series"):
                    lazylibrarian.UPDATE_MSG = 'Updating database to hold Series'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    myDB.action('ALTER TABLE books ADD COLUMN Series TEXT')
                    addedSeries = True

                # SeriesOrder shouldn't be an integer, some later written books
                # and novellas logically go inbetween books of the main series,
                # and their SeriesOrder is not an integer, eg 1.5
                # so we need to update SeriesOrder to store as text.
                # Because sqlite can't drop columns we create a new column SeriesNum,
                # inherit the old column values, and use SeriesNum instead
                if not has_column(myDB, "books", "SeriesNum") and has_column(myDB, "books", "SeriesOrder"):
                    # no SeriesNum column, so create one
                    lazylibrarian.UPDATE_MSG = 'Updating books to hold SeriesNum'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    myDB.action('ALTER TABLE books ADD COLUMN SeriesNum TEXT')
                    myDB.action('UPDATE books SET SeriesNum = SeriesOrder')
                    myDB.action('UPDATE books SET SeriesOrder = Null')

                addedIssues = False
                if not has_column(myDB, "issues", "Title"):
                    lazylibrarian.UPDATE_MSG = 'Updating database to hold Issues table'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    myDB.action('CREATE TABLE issues (Title TEXT, IssueID TEXT, IssueAcquired TEXT, \
                        IssueDate TEXT, IssueFile TEXT)')
                    addedIssues = True

                if not has_column(myDB, "issues", "IssueID"):
                    lazylibrarian.UPDATE_MSG = 'Updating Issues table to hold IssueID'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    myDB.action('ALTER TABLE issues ADD COLUMN IssueID TEXT')
                    addedIssues = True

                myDB.action('DROP TABLE if exists capabilities')

                if addedIssues:
                    try:
                        magazinescan.magazineScan()
                    except Exception as e:
                        logger.debug("Failed to scan magazines, %s" % str(e))

                if addedWorkPage:
                    try:
                        lazylibrarian.UPDATE_MSG = 'Adding WorkPage to existing books'
                        logger.debug(lazylibrarian.UPDATE_MSG)
                        threading.Thread(target=bookwork.setWorkPages, name="ADDWORKPAGE", args=[]).start()
                    except Exception as e:
                        logger.debug("Failed to update WorkPages, %s" % str(e))

                if addedSeries:
                    try:
                        books = myDB.select('SELECT BookID, BookName FROM books')
                        if books:
                            lazylibrarian.UPDATE_MSG = 'Adding series to existing books'
                            logger.debug(lazylibrarian.UPDATE_MSG)
                            tot = len(books)
                            cnt = 0
                            for book in books:
                                cnt += 1
                                lazylibrarian.UPDATE_MSG = 'Adding series to existing books: %s of %s' % (cnt, tot)
                                series, seriesNum = bookSeries(book["BookName"])
                                if series:
                                    controlValueDict = {"BookID": book["BookID"]}
                                    newValueDict = {"series": series, "seriesNum": seriesNum}
                                    myDB.upsert("books", newValueDict, controlValueDict)
                    except Exception as e:
                        logger.error('Error: ' + str(e))

            if db_version < 2:
                try:
                    results = myDB.select('SELECT BookID,NZBsize FROM wanted WHERE NZBsize LIKE "% MB"')
                    if results:
                        lazylibrarian.UPDATE_MSG = 'Removing units from wanted table'
                        logger.debug(lazylibrarian.UPDATE_MSG)
                        tot = len(results)
                        cnt = 0
                        for units in results:
                            cnt += 1
                            lazylibrarian.UPDATE_MSG = 'Removing units from wanted table: %s of %s' % (cnt, tot)
                            nzbsize = units["NZBsize"]
                            nzbsize = nzbsize.split(' ')[0]
                            myDB.action('UPDATE wanted SET NZBsize = "%s" WHERE BookID = "%s"' %
                                        (nzbsize, units["BookID"]))
                except Exception as e:
                    logger.error('Error: ' + str(e))

            if db_version < 3:
                if has_column(myDB, "books", "SeriesOrder"):
                    lazylibrarian.UPDATE_MSG = 'Removing SeriesOrder from books table'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    myDB.action('CREATE TABLE IF NOT EXISTS temp_books (AuthorID TEXT, AuthorName TEXT, AuthorLink TEXT, \
                        BookName TEXT, BookSub TEXT, BookDesc TEXT, BookGenre TEXT, BookIsbn TEXT, BookPub TEXT, \
                        BookRate INTEGER, BookImg TEXT, BookPages INTEGER, BookLink TEXT, BookID TEXT UNIQUE, \
                        BookFile TEXT, BookDate TEXT, BookLang TEXT, BookAdded TEXT, Status TEXT, Series TEXT, \
                        SeriesNum TEXT, WorkPage TEXT)')
                    myDB.action('INSERT INTO temp_books SELECT AuthorID,AuthorName,AuthorLink,BookName,BookSub, \
                        BookDesc,BookGenre,BookIsbn,BookPub,BookRate,BookImg,BookPages,BookLink,BookID, \
                        BookFile,BookDate,BookLang,BookAdded,Status,Series,SeriesNum,WorkPage FROM books')
                    myDB.action('DROP TABLE books')
                    myDB.action('ALTER TABLE temp_books RENAME TO books')

                if not has_column(myDB, "pastissues", "BookID"):
                    lazylibrarian.UPDATE_MSG = 'Moving magazine past issues into new table'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    myDB.action('CREATE TABLE pastissues AS SELECT * FROM wanted WHERE Status="Skipped" \
                        AND length(AuxInfo) > 0')
                    myDB.action('DELETE FROM wanted WHERE Status="Skipped" AND length(AuxInfo) > 0')

            if db_version < 4:
                if not has_column(myDB, "stats", "duplicates"):
                    lazylibrarian.UPDATE_MSG = 'Updating stats table to hold duplicates'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    myDB.action('ALTER TABLE stats ADD COLUMN duplicates INT')

            if db_version < 5:
                issues = myDB.select(
                    'SELECT IssueID,IssueDate from issues WHERE length(IssueDate) < 4 and length(IssueDate) > 0')
                if issues:
                    lazylibrarian.UPDATE_MSG = 'Updating issues table to hold 4 digit issue numbers'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    tot = len(issues)
                    cnt = 0
                    for issue in issues:
                        cnt += 1
                        lazylibrarian.UPDATE_MSG = 'Updating issues table 4 digits: %s of %s' % (cnt, tot)
                        issueid = issue['IssueID']
                        issuedate = str(issue['IssueDate'])
                        issuedate = issuedate.zfill(4)
                        myDB.action('UPDATE issues SET IssueDate="%s" WHERE IssueID="%s"' % (issuedate, issueid))

                mags = myDB.select(
                    'SELECT Title,IssueDate from magazines WHERE length(IssueDate) < 4 and length(IssueDate) > 0')
                if mags:
                    lazylibrarian.UPDATE_MSG = 'Updating magazines table to 4 digits'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    tot = len(mags)
                    cnt = 0
                    for mag in mags:
                        cnt += 1
                        lazylibrarian.UPDATE_MSG = 'Updating magazines table to 4 digits: %s of %s' % (cnt, tot)
                        title = mag['Title']
                        issuedate = str(mag['IssueDate'])
                        issuedate = issuedate.zfill(4)
                        myDB.action('UPDATE magazines SET IssueDate="%s" WHERE Title="%s"' % (issuedate, title))

            if db_version < 6:
                if not has_column(myDB, "books", "Manual"):
                    lazylibrarian.UPDATE_MSG = 'Updating books table to hold Manual setting'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    myDB.action('ALTER TABLE books ADD COLUMN Manual TEXT')

            if db_version < 7:
                if not has_column(myDB, "wanted", "Source"):
                    lazylibrarian.UPDATE_MSG = 'Updating wanted table to hold Source and DownloadID'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    myDB.action('ALTER TABLE wanted ADD COLUMN Source TEXT')
                    myDB.action('ALTER TABLE wanted ADD COLUMN DownloadID TEXT')

            if db_version < 8:
                src = os.path.join(lazylibrarian.PROG_DIR, 'data/images/cache/')
                dst = lazylibrarian.CACHEDIR
                images = myDB.select('SELECT AuthorID, AuthorImg FROM authors WHERE AuthorImg LIKE "images/cache/%"')
                if images:
                    logger.debug('Moving author images to new location')
                    tot = len(images)
                    cnt = 0
                    for image in images:
                        cnt += 1
                        lazylibrarian.UPDATE_MSG = "Moving author images to new location: %s of %s" % (cnt, tot)
                        img = image['AuthorImg']
                        img = img[7:]
                        myDB.action('UPDATE authors SET AuthorImg="%s" WHERE AuthorID="%s"' %
                                    (img, image['AuthorID']))
                        img = img[6:]
                        srcfile = os.path.join(src, img)
                        if os.path.isfile(srcfile):
                            try:
                                shutil.move(os.path.join(src, img), os.path.join(dst, img))
                            except Exception as e:
                                logger.warn("dbupgrade: %s" % str(e))
                    logger.debug("Author Image cache updated")

                images = myDB.select('SELECT BookID, BookImg FROM books WHERE BookImg LIKE "images/cache/%"')
                if images:
                    logger.debug('Moving book images to new location')
                    tot = len(images)
                    cnt = 0
                    for image in images:
                        cnt += 1
                        lazylibrarian.UPDATE_MSG = "Moving book images to new location: %s of %s" % (cnt, tot)
                        img = image['BookImg']
                        img = img[7:]
                        myDB.action('UPDATE books SET BookImg="%s" WHERE BookID="%s"' % (img, image['BookID']))
                        img = img[6:]
                        srcfile = os.path.join(src, img)
                        if os.path.isfile(srcfile):
                            try:
                                shutil.move(srcfile, os.path.join(dst, img))
                            except Exception as e:
                                logger.warn("dbupgrade: %s" % str(e))
                    logger.debug("Book Image cache updated")

            if db_version < 9:
                if not has_column(myDB, "magazines", "Reject"):
                    # remove frequency column, rename regex to reject, add new regex column for searches
                    lazylibrarian.UPDATE_MSG = 'Updating magazines table'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    myDB.action('CREATE TABLE IF NOT EXISTS temp_table (Title TEXT, Regex TEXT, Status TEXT, \
                        MagazineAdded TEXT, LastAcquired TEXT, IssueDate TEXT, IssueStatus TEXT, Reject TEXT)')
                    myDB.action('INSERT INTO temp_table SELECT Title, Regex, Status, MagazineAdded, LastAcquired, \
                        IssueDate, IssueStatus, Regex FROM magazines')
                    myDB.action('DROP TABLE magazines')
                    myDB.action('ALTER TABLE temp_table RENAME TO magazines')
                    myDB.action('UPDATE magazines SET Regex = Null')

            if db_version < 10:
                # make sure columns in pastissues match those in wanted table
                # needed when upgrading from old 3rd party packages (eg freenas)
                myDB.action('DROP TABLE pastissues')
                myDB.action('CREATE TABLE pastissues AS SELECT * FROM wanted WHERE 0')  # same columns, but empty table

            if db_version < 11:
                # keep last book image
                if not has_column(myDB, "authors", "LastBookImg"):
                    lazylibrarian.UPDATE_MSG = 'Updating author table to hold last book image'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    myDB.action('ALTER TABLE authors ADD COLUMN LastBookImg TEXT')
                    books = myDB.select('SELECT AuthorID, AuthorName, LastBook from authors')
                    if books:
                        for book in books:
                            lazylibrarian.UPDATE_MSG = 'Updating last book image for %s' % book['AuthorName']
                            if book['LastBook']:
                                match = myDB.match('SELECT BookImg from books WHERE AuthorID="%s" AND BookName="%s"' %
                                                   (book['AuthorID'], book['LastBook']))
                                if match:
                                    myDB.action('UPDATE authors SET LastBookImg="%s" WHERE AuthorID=%s' %
                                                (match['BookImg'], book['AuthorID']))

            if db_version < 12:
                # keep last magazine issue image
                if not has_column(myDB, "Magazines", "LatestCover"):
                    lazylibrarian.UPDATE_MSG = 'Updating magazine table to hold last issue image'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    myDB.action('ALTER TABLE magazines ADD COLUMN LatestCover TEXT')
                    mags = myDB.select('SELECT Title, LastAcquired from magazines')
                    if mags:
                        for mag in mags:
                            lazylibrarian.UPDATE_MSG = 'Updating last issue image for %s' % mag['Title']
                            match = myDB.match('SELECT IssueFile from issues WHERE IssueAcquired="%s" AND Title="%s"' %
                                               (mag['LastAcquired'], mag['Title']))
                            if match:
                                coverfile = os.path.splitext(match['IssueFile'])[0] + '.jpg'
                                if os.path.exists(coverfile):
                                    myDB.action('UPDATE magazines SET LatestCover="%s" WHERE Title="%s"' %
                                                (coverfile, mag['Title']))

            if db_version < 13:
                if not has_column(myDB, "authors", "Manual"):
                    lazylibrarian.UPDATE_MSG = 'Updating authors table to hold Manual setting'
                    logger.debug(lazylibrarian.UPDATE_MSG)
                    myDB.action('ALTER TABLE authors ADD COLUMN Manual TEXT')

            if db_version < 14:
                src = lazylibrarian.CACHEDIR
                try:
                    os.mkdir(os.path.join(src, 'author'))
                except OSError as e:
                    if e.errno != 17:  # already exists is ok
                        logger.debug('mkdir author cache reports: %s' % str(e))

                query = 'SELECT AuthorName, AuthorID, AuthorImg FROM authors '
                query += 'WHERE AuthorImg LIKE "cache/%" '
                query += 'AND AuthorImg NOT LIKE "cache/author/%"'
                images = myDB.select(query)
                if images:
                    tot = len(images)
                    logger.debug('Moving %s author images to new location' % tot)
                    cnt = 0
                    for image in images:
                        cnt += 1
                        lazylibrarian.UPDATE_MSG = "Moving author images to new location: %s of %s" % (cnt, tot)
                        try:
                            img = image['AuthorImg']
                            img = img.rsplit('/', 1)[1]
                            srcfile = os.path.join(src, img)
                            if os.path.isfile(srcfile):
                                try:
                                    shutil.move(srcfile, os.path.join(src, "author", img))
                                    myDB.action('UPDATE authors SET AuthorImg="cache/author/%s" WHERE AuthorID="%s"' %
                                                (img, image['AuthorID']))
                                except Exception as e:
                                    logger.warn("dbupgrade: %s" % str(e))
                        except Exception as e:
                            logger.warn('Failed to update author image for %s: %s' %
                                        (image['AuthorName'], str(e)))
                    logger.debug("Author Image cache updated")

                try:
                    os.mkdir(os.path.join(src, 'book'))
                except OSError as e:
                    if e.errno != 17:  # already exists is ok
                        logger.debug('mkdir book cache reports: %s' % str(e))

                query = 'SELECT BookName, BookID, BookImg FROM books '
                query += 'WHERE BookImg LIKE "cache/%" '
                query += 'AND BookImg NOT LIKE "cache/book/%"'
                images = myDB.select(query)
                if images:
                    tot = len(images)
                    logger.debug('Moving %s book images to new location' % tot)
                    cnt = 0
                    for image in images:
                        cnt += 1
                        lazylibrarian.UPDATE_MSG = "Moving book images to new location: %s of %s" % (cnt, tot)
                        try:
                            img = image['BookImg']
                            img = img.rsplit('/', 1)[1]
                            srcfile = os.path.join(src, img)
                            if os.path.isfile(srcfile):
                                try:
                                    shutil.move(srcfile, os.path.join(src, "book", img))
                                    myDB.action('UPDATE books SET BookImg="cache/book/%s" WHERE BookID="%s"' %
                                                (img, image['BookID']))
                                except Exception as e:
                                    logger.warn("dbupgrade: %s" % str(e))
                        except Exception as e:
                            logger.warn('Failed to update book image for %s: %s' %
                                        (image['BookName'], str(e)))
                    logger.debug("Book Image cache updated")

                # at this point there should be no more .jpg files in the root of the cachedir
                # any that are still there are for books/authors deleted from database
                # or magazine latest issue cover files that get copied as required
                for image in os.listdir(src):
                    if image.endswith('.jpg'):
                        os.remove(os.path.join(src, image))

            if db_version < 15:
                myDB.action('CREATE TABLE IF NOT EXISTS series (SeriesID INTEGER PRIMARY KEY, SeriesName TEXT, \
                    AuthorID TEXT, Status TEXT)')
                myDB.action('CREATE TABLE IF NOT EXISTS member (SeriesID INTEGER, BookID TEXT, SeriesNum TEXT)')
                if has_column(myDB, "books", "SeriesNum"):
                    lazylibrarian.UPDATE_MSG = 'Populating series and member tables'
                    books = myDB.select('SELECT BookID, Series, SeriesNum from books')
                    if books:
                        tot = len(books)
                        logger.debug("Updating book series for %s book%s" % (tot, plural(tot)))
                        cnt = 0
                        for book in books:
                            cnt += 1
                            lazylibrarian.UPDATE_MSG = "Updating book series: %s of %s" % (cnt, tot)
                            seriesdict = getWorkSeries(book['BookID'])
                            if not seriesdict:  # no workpage series, use the current values if present
                                if book['Series'] and book['SeriesNum']:
                                    seriesdict = {cleanName(unaccented(book['Series'])): book['SeriesNum']}
                            setSeries(seriesdict, book['BookID'], seriesauthors=False)
                        # deleteEmptySeries  # shouldn't be any on first run?
                        lazylibrarian.UPDATE_MSG = "Book series update complete"
                        logger.debug(lazylibrarian.UPDATE_MSG)

                    lazylibrarian.UPDATE_MSG = 'Removing seriesnum from books table'
                    myDB.action('CREATE TABLE IF NOT EXISTS temp_table (AuthorID TEXT, AuthorName TEXT, AuthorLink TEXT, \
                        BookName TEXT, BookSub TEXT, BookDesc TEXT, BookGenre TEXT, BookIsbn TEXT, BookPub TEXT, \
                        BookRate INTEGER, BookImg TEXT, BookPages INTEGER, BookLink TEXT, BookID TEXT UNIQUE, \
                        BookFile TEXT, BookDate TEXT, BookLang TEXT, BookAdded TEXT, Status TEXT, Series TEXT, \
                        WorkPage TEXT, Manual TEXT)')
                    myDB.action('INSERT INTO temp_table SELECT AuthorID, AuthorName, AuthorLink, BookName, BookSub, \
                        BookDesc, BookGenre, BookIsbn, BookPub, BookRate, BookImg, BookPages, BookLink, BookID, \
                        BookFile, BookDate, BookLang, BookAdded, Status, Series, WorkPage, Manual from books')
                    myDB.action('DROP TABLE books')
                    myDB.action('ALTER TABLE temp_table RENAME TO books')
                    lazylibrarian.UPDATE_MSG = 'Reorganisation of books table complete'

            if db_version < 16:
                if has_column(myDB, "books", "AuthorLink"):
                    lazylibrarian.UPDATE_MSG = 'Removing series, authorlink and authorname from books table'
                    myDB.action('CREATE TABLE IF NOT EXISTS temp_table (AuthorID TEXT, \
                        BookName TEXT, BookSub TEXT, BookDesc TEXT, BookGenre TEXT, BookIsbn TEXT, BookPub TEXT, \
                        BookRate INTEGER, BookImg TEXT, BookPages INTEGER, BookLink TEXT, BookID TEXT UNIQUE, \
                        BookFile TEXT, BookDate TEXT, BookLang TEXT, BookAdded TEXT, Status TEXT, WorkPage TEXT, \
                        Manual TEXT)')
                    myDB.action('INSERT INTO temp_table SELECT AuthorID, BookName, BookSub, \
                        BookDesc, BookGenre, BookIsbn, BookPub, BookRate, BookImg, BookPages, BookLink, BookID, \
                        BookFile, BookDate, BookLang, BookAdded, Status, WorkPage, Manual from books')
                    myDB.action('DROP TABLE books')
                    myDB.action('ALTER TABLE temp_table RENAME TO books')
                    lazylibrarian.UPDATE_MSG = 'Reorganisation of books table complete'

            if db_version < 17:
                if has_column(myDB, "series", "AuthorID"):
                    lazylibrarian.UPDATE_MSG = 'Creating seriesauthors table'
                    # In this version of the database there is only one author per series so use that as starting point
                    myDB.action('CREATE TABLE seriesauthors (SeriesID INTEGER, AuthorID TEXT, UNIQUE (SeriesID,AuthorID))')
                    series = myDB.select('SELECT SeriesID,AuthorID from series')
                    cnt = 0
                    tot = len(series)
                    for item in series:
                        cnt += 1
                        lazylibrarian.UPDATE_MSG = "Updating seriesauthors: %s of %s" % (cnt, tot)
                        myDB.action('insert into seriesauthors (SeriesID, AuthorID) values (%s, %s)' %
                                    (item['SeriesID'], item['AuthorID']), suppress='UNIQUE')
                    myDB.action('DROP TABLE temp_table')
                    myDB.action('CREATE TABLE temp_table (SeriesID INTEGER PRIMARY KEY, SeriesName TEXT, \
                        Status TEXT)')
                    myDB.action('INSERT INTO temp_table SELECT SeriesID, SeriesName, Status FROM series')
                    myDB.action('DROP TABLE series')
                    myDB.action('ALTER TABLE temp_table RENAME TO series')
                    lazylibrarian.UPDATE_MSG = 'Reorganisation of series table complete'

            if db_version < 18:
                data = myDB.match('pragma index_list(seriesauthors)')
                if not data:
                    lazylibrarian.UPDATE_MSG = 'Adding unique constraint to seriesauthors table'
                    myDB.action('DROP TABLE IF EXISTS temp_table')
                    myDB.action('ALTER TABLE seriesauthors RENAME to temp_table')
                    myDB.action('CREATE TABLE seriesauthors (SeriesID INTEGER, AuthorID TEXT, UNIQUE (SeriesID,AuthorID))')
                    series = myDB.select('SELECT SeriesID,AuthorID from temp_table')
                    cnt = 0
                    tot = len(series)
                    for item in series:
                        cnt += 1
                        lazylibrarian.UPDATE_MSG = "Updating seriesauthors: %s of %s" % (cnt, tot)
                        myDB.action('insert into seriesauthors (SeriesID, AuthorID) values (%s, %s)' %
                                    (item['SeriesID'], item['AuthorID']), suppress='UNIQUE')
                    myDB.action('DROP TABLE temp_table')
                    lazylibrarian.UPDATE_MSG = 'Reorganisation of seriesauthors complete'

            # Now do any non-version-specific tidying
            try:
                authors = myDB.select('SELECT AuthorID FROM authors WHERE AuthorName IS NULL')
                if authors:
                    logger.debug('Removing %s un-named author%s from database' %
                                 (len(authors), plural(len(authors))))
                    for author in authors:
                        authorid = author["AuthorID"]
                        myDB.action('DELETE from authors WHERE AuthorID="%s"' % authorid)
                        myDB.action('DELETE from books WHERE AuthorID="%s"' % authorid)
            except Exception as e:
                logger.error('Error: ' + str(e))

            myDB.action('PRAGMA user_version = %s' % db_current_version)
            lazylibrarian.UPDATE_MSG = 'Cleaning Database after upgrade'
            myDB.action('vacuum')
            lazylibrarian.UPDATE_MSG = 'Database updated to version %s' % db_current_version
            logger.info(lazylibrarian.UPDATE_MSG)

            restartJobs(start='Start')

        lazylibrarian.UPDATE_MSG = ''

    except Exception:
        logger.error('Unhandled exception in database update: %s' % traceback.format_exc())
        lazylibrarian.UPDATE_MSG = ''
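# dbupgrade() relies throughout on a has_column(myDB, table, column) helper to decide
# whether an ALTER TABLE is needed. The helper itself is not part of this extract; the
# sketch below is an assumed illustration of the usual approach (sqlite's PRAGMA
# table_info against a plain sqlite3 connection), not the actual LazyLibrarian code.

def has_column_sketch(connection, table, column):
    # PRAGMA table_info returns one row per column; the column name is the second field
    cursor = connection.execute('PRAGMA table_info(%s)' % table)
    return any(row[1] == column for row in cursor.fetchall())

# Assumed usage, mirroring the checks in dbupgrade():
#   conn = sqlite3.connect('lazylibrarian.db')
#   if not has_column_sketch(conn, 'books', 'Manual'):
#       conn.execute('ALTER TABLE books ADD COLUMN Manual TEXT')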
def get_author_books(self, authorid=None, authorname=None, refresh=False):
    api_hits = 0
    gr_lang_hits = 0
    lt_lang_hits = 0
    gb_lang_change = 0
    cache_hits = 0
    not_cached = 0
    URL = 'http://www.goodreads.com/author/list/' + authorid + '.xml?' + urllib.urlencode(self.params)

    # Artist is loading
    myDB = database.DBConnection()
    controlValueDict = {"AuthorID": authorid}
    newValueDict = {"Status": "Loading"}
    myDB.upsert("authors", newValueDict, controlValueDict)

    books_dict = []
    try:
        rootxml, in_cache = get_xml_request(URL, useCache=not refresh)
    except Exception as e:
        logger.error("Error fetching author books: %s" % e)
        return books_dict
    if rootxml is None:
        logger.debug("Error requesting author books")
        return books_dict
    if not in_cache:
        api_hits = api_hits + 1

    resultxml = rootxml.getiterator('book')
    valid_langs = [valid_lang.strip() for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')]

    if not len(resultxml):
        logger.warn('[%s] No books found for author with ID: %s' % (authorname, authorid))
    else:
        logger.debug("[%s] Now processing books with GoodReads API" % authorname)
        resultsCount = 0
        removedResults = 0
        duplicates = 0
        ignored = 0
        added_count = 0
        updated_count = 0
        book_ignore_count = 0
        total_count = 0
        logger.debug(u"url " + URL)

        authorNameResult = rootxml.find('./author/name').text
        logger.debug(u"author name " + authorNameResult)
        loopCount = 1

        while resultxml is not None:
            for book in resultxml:
                total_count = total_count + 1

                if book.find('publication_year').text is None:
                    pubyear = "0000"
                else:
                    pubyear = book.find('publication_year').text

                try:
                    bookimg = book.find('image_url').text
                    if 'nocover' in bookimg:
                        bookimg = 'images/nocover.png'
                except (KeyError, AttributeError):
                    bookimg = 'images/nocover.png'

                # PAB this next section tries to get the book language using the isbn13 to look it up. If no isbn13 we skip the
                # book entirely, rather than including it with an "Unknown" language. Changed this so we can still include the book
                # with language set to "Unknown". There is a setting in config.ini to allow or skip books with "Unknown" language
                # if you really don't want to include them.
                # Not all GR books have isbn13 filled in, but all have a GR bookid, which we've already got, so use that.
                # Also, with GR API rules we can only call the API once per second, which slows us down a lot when all we want
                # is to get the language. We sleep for one second per book that GR knows about for each author you have in your
                # library. The libraryThing API has the same 1 second restriction, and is limited to 1000 hits per day, but has
                # fewer books with unknown language. To get around this and speed up the process, see if we already have a book
                # in the database with a similar start to the ISBN. The way ISBNs work, digits 3-5 of a 13 char ISBN or digits 0-2
                # of a 10 digit ISBN indicate the region/language so if two books have the same 3 digit isbn code, they _should_
                # be the same language.
                # I ran a simple python script on my library of 1500 books, and these codes were 100% correct on matching book
                # languages, no mis-matches. It did result in a small number of books with "unknown" language being wrongly matched
                # but most "unknown" were matched to the correct language.
                # We could look up ISBNs we already know about in the database, but this only holds books in the languages we want
                # to keep, which reduces the number of cache hits, so we create a new database table, holding ALL results including
                # the ISBNs for languages we don't want and books we reject.
                # The new table is created (if not exists) in init.py so by the time we get here there is an existing table.
                # If we haven't an already matching partial ISBN, look up language code from libraryThing
                # "http://www.librarything.com/api/thingLang.php?isbn=1234567890"
                # If you find a matching language, add it to the database. If "unknown" or "invalid", try GR as maybe GR can
                # provide a match.
                # If both LT and GR return unknown, add isbn to db as "unknown". No point in repeatedly asking LT for a code
                # it's told you it doesn't know.
                # As an extra option, if language includes "All" in config.ini, we can skip this whole section and process
                # everything much faster by not querying for language at all.
                # It does mean we include a lot of unwanted foreign translations in the database, but it's _much_ faster.
                # (A standalone sketch of this ISBN-prefix idea follows this function.)

                bookLanguage = "Unknown"
                find_field = "id"
                isbn = ""
                isbnhead = ""
                if "All" not in valid_langs:  # do we care about language
                    if book.find('isbn').text is not None:
                        find_field = "isbn"
                        isbn = book.find('isbn').text
                        isbnhead = isbn[0:3]
                    else:
                        if book.find('isbn13').text is not None:
                            find_field = "isbn13"
                            isbn = book.find('isbn13').text
                            isbnhead = isbn[3:6]

                    if find_field != 'id':  # isbn or isbn13 found
                        match = myDB.action('SELECT lang FROM languages where isbn = "%s"' % isbnhead).fetchone()
                        if match:
                            bookLanguage = match['lang']
                            cache_hits = cache_hits + 1
                            logger.debug("Found cached language [%s] for %s [%s]" %
                                         (bookLanguage, find_field, isbnhead))
                        else:
                            # no match in cache, try searching librarything for a language code using the isbn
                            # if no language found, librarything return value is "invalid" or "unknown"
                            # returns plain text, not xml
                            BOOK_URL = 'http://www.librarything.com/api/thingLang.php?isbn=' + isbn
                            try:
                                librarything_wait()
                                resp = urllib2.urlopen(BOOK_URL, timeout=30).read()
                                lt_lang_hits = lt_lang_hits + 1
                                logger.debug("LibraryThing reports language [%s] for %s" % (resp, isbnhead))
                                if 'invalid' in resp or 'Unknown' in resp:
                                    find_field = "id"  # reset the field to force search on goodreads
                                else:
                                    bookLanguage = resp  # found a language code
                                    myDB.action('insert into languages values ("%s", "%s")' %
                                                (isbnhead, bookLanguage))
                                    logger.debug(u"LT language %s: %s" % (isbnhead, bookLanguage))
                            except Exception as e:
                                logger.error("Error finding LT language result for [%s], %s" % (isbn, e))
                                find_field = "id"  # reset the field to search on goodreads

                    if find_field == 'id':
                        # [or bookLanguage == "Unknown"] no earlier match, we'll have to search the goodreads api
                        try:
                            if book.find(find_field).text is not None:
                                BOOK_URL = 'http://www.goodreads.com/book/show?id=' + \
                                           book.find(find_field).text + '&' + urllib.urlencode(self.params)
                                logger.debug(u"Book URL: " + BOOK_URL)
                                try:
                                    time_now = int(time.time())
                                    if time_now <= lazylibrarian.LAST_GOODREADS:
                                        time.sleep(1)
                                    BOOK_rootxml, in_cache = get_xml_request(BOOK_URL)
                                    if BOOK_rootxml is None:
                                        logger.debug('Error requesting book language code')
                                        bookLanguage = ""
                                    else:
                                        if not in_cache:
                                            # only update last_goodreads if the result wasn't found in the cache
                                            lazylibrarian.LAST_GOODREADS = time_now
                                        bookLanguage = BOOK_rootxml.find('./book/language_code').text
                                except Exception as e:
                                    logger.error("Error finding book results: %s" % e)
                                if not in_cache:
                                    gr_lang_hits = gr_lang_hits + 1
                                if not bookLanguage:
                                    bookLanguage = "Unknown"

                                if isbnhead != "":
                                    # GR didn't give an isbn so we can't cache it, just use language for this book
                                    myDB.action('insert into languages values ("%s", "%s")' %
                                                (isbnhead, bookLanguage))
                                    logger.debug("GoodReads reports language [%s] for %s" %
                                                 (bookLanguage, isbnhead))
                                else:
                                    not_cached = not_cached + 1

                                logger.debug(u"GR language: " + bookLanguage)
                            else:
                                logger.debug("No %s provided for [%s]" % (find_field, book.find('title').text))
                                # continue
                        except Exception as e:
                            logger.debug(u"An error has occured: %s" % e)

                if bookLanguage not in valid_langs:
                    logger.debug('Skipped a book with language %s' % bookLanguage)
                    ignored = ignored + 1
                    continue

                bookname = book.find('title').text
                bookid = book.find('id').text
                bookdesc = book.find('description').text
                bookisbn = book.find('isbn').text
                bookpub = book.find('publisher').text
                booklink = book.find('link').text
                bookrate = float(book.find('average_rating').text)
                bookpages = book.find('num_pages').text
                bookname = unaccented(bookname)

                if ': ' in bookname:
                    parts = bookname.split(': ', 1)
                    bookname = parts[0]
                    booksub = parts[1]
                else:
                    booksub = ''

                dic = {':': '', '"': '', '\'': ''}
                bookname = replace_all(bookname, dic)
                bookname = bookname.strip()  # strip whitespace
                booksub = replace_all(booksub, dic)
                booksub = booksub.strip()  # strip whitespace

                if booksub:
                    series, seriesNum = bookSeries(booksub)
                else:
                    series, seriesNum = bookSeries(bookname)

                # GoodReads sometimes has multiple bookids for the same book (same author/title, different editions)
                # and sometimes uses the same bookid if the book is the same but the title is slightly different
                # We use bookid, then reject if another author/title has a different bookid so we just keep one...
                find_book_status = myDB.select('SELECT * FROM books WHERE BookID = "%s"' % bookid)
                if find_book_status:
                    for resulted in find_book_status:
                        book_status = resulted['Status']
                        locked = resulted['Manual']
                else:
                    book_status = lazylibrarian.NEWBOOK_STATUS
                    locked = False

                rejected = False
                if re.match('[^\w-]', bookname):  # reject books with bad characters in title
                    logger.debug(u"removed result [" + bookname + "] for bad characters")
                    removedResults = removedResults + 1
                    rejected = True

                if not rejected and not bookname:
                    logger.debug('Rejecting bookid %s for %s, no bookname' % (bookid, authorNameResult))
                    removedResults = removedResults + 1
                    rejected = True

                if not rejected:
                    find_books = myDB.select('SELECT * FROM books WHERE BookName = "%s" and AuthorName = "%s"' %
                                             (bookname, authorNameResult))
                    if find_books:
                        for find_book in find_books:
                            if find_book['BookID'] != bookid:
                                # we have a book with this author/title already
                                logger.debug('Rejecting bookid %s for [%s][%s] already got %s' %
                                             (find_book['BookID'], authorNameResult, bookname, bookid))
                                duplicates = duplicates + 1
                                rejected = True
                                break

                if not rejected:
                    find_books = myDB.select('SELECT * FROM books WHERE BookID = "%s"' % bookid)
                    if find_books:
                        # we have a book with this bookid already
                        logger.debug('Rejecting bookid %s for [%s][%s] already got this bookid in database' %
                                     (bookid, authorNameResult, bookname))
                        duplicates = duplicates + 1
                        rejected = True
                        break

                if not rejected:
                    if book_status != "Ignored":
                        if not locked:
                            controlValueDict = {"BookID": bookid}
                            newValueDict = {
                                "AuthorName": authorNameResult,
                                "AuthorID": authorid,
                                "AuthorLink": None,
                                "BookName": bookname,
                                "BookSub": booksub,
                                "BookDesc": bookdesc,
                                "BookIsbn": bookisbn,
                                "BookPub": bookpub,
                                "BookGenre": None,
                                "BookImg": bookimg,
                                "BookLink": booklink,
                                "BookRate": bookrate,
                                "BookPages": bookpages,
                                "BookDate": pubyear,
                                "BookLang": bookLanguage,
                                "Status": book_status,
                                "BookAdded": today(),
                                "Series": series,
                                "SeriesNum": seriesNum
                            }

                            resultsCount = resultsCount + 1
                            myDB.upsert("books", newValueDict, controlValueDict)
                            logger.debug(u"Book found: " + book.find('title').text + " " + pubyear)

                            if 'nocover' in bookimg or 'nophoto' in bookimg:
                                # try to get a cover from librarything
                                workcover = getBookCover(bookid)
                                if workcover:
                                    logger.debug(u'Updated cover for %s to %s' % (bookname, workcover))
                                    controlValueDict = {"BookID": bookid}
                                    newValueDict = {"BookImg": workcover}
                                    myDB.upsert("books", newValueDict, controlValueDict)
                            elif bookimg and bookimg.startswith('http'):
                                link = cache_cover(bookid, bookimg)
                                if link is not None:
                                    controlValueDict = {"BookID": bookid}
                                    newValueDict = {"BookImg": link}
                                    myDB.upsert("books", newValueDict, controlValueDict)

                            if seriesNum is None:
                                # try to get series info from librarything
                                series, seriesNum = getWorkSeries(bookid)
                                if seriesNum:
                                    logger.debug(u'Updated series: %s [%s]' % (series, seriesNum))
                                    controlValueDict = {"BookID": bookid}
                                    newValueDict = {"Series": series, "SeriesNum": seriesNum}
                                    myDB.upsert("books", newValueDict, controlValueDict)

                            worklink = getWorkPage(bookid)
                            if worklink:
                                controlValueDict = {"BookID": bookid}
                                newValueDict = {"WorkPage": worklink}
                                myDB.upsert("books", newValueDict, controlValueDict)

                            if not find_book_status:
                                logger.debug(u"[%s] Added book: %s" % (authorname, bookname))
                                added_count = added_count + 1
                            else:
                                logger.debug(u"[%s] Updated book: %s" % (authorname, bookname))
                                updated_count = updated_count + 1
                    else:
                        book_ignore_count = book_ignore_count + 1

            loopCount = loopCount + 1
            URL = 'http://www.goodreads.com/author/list/' + authorid + '.xml?' + \
                  urllib.urlencode(self.params) + '&page=' + str(loopCount)
            resultxml = None
            try:
                rootxml, in_cache = get_xml_request(URL, useCache=not refresh)
                if rootxml is None:
                    logger.debug('Error requesting next page of results')
                else:
                    resultxml = rootxml.getiterator('book')
                    if not in_cache:
                        api_hits = api_hits + 1
            except Exception as e:
                resultxml = None
                logger.error("Error finding next page of results: %s" % e)

            if resultxml is not None:
                if all(False for book in resultxml):  # returns True if iterator is empty
                    resultxml = None

        lastbook = myDB.action('SELECT BookName, BookLink, BookDate from books WHERE AuthorID="%s" \
            AND Status != "Ignored" order by BookDate DESC' % authorid).fetchone()

        if lastbook:
            lastbookname = lastbook['BookName']
            lastbooklink = lastbook['BookLink']
            lastbookdate = lastbook['BookDate']
        else:
            lastbookname = None
            lastbooklink = None
            lastbookdate = None

        controlValueDict = {"AuthorID": authorid}
        newValueDict = {
            "Status": "Active",
            "LastBook": lastbookname,
            "LastLink": lastbooklink,
            "LastDate": lastbookdate
        }
        myDB.upsert("authors", newValueDict, controlValueDict)

        # This is here because GoodReads sometimes has several entries with the same BookID!
        modified_count = added_count + updated_count

        logger.debug("Found %s total book%s for author" % (total_count, plural(total_count)))
        logger.debug("Removed %s bad language result%s for author" % (ignored, plural(ignored)))
        logger.debug("Removed %s bad character or no-name result%s for author" %
                     (removedResults, plural(removedResults)))
        logger.debug("Removed %s duplicate result%s for author" % (duplicates, plural(duplicates)))
        logger.debug("Ignored %s book%s by author marked as Ignored" %
                     (book_ignore_count, plural(book_ignore_count)))
        logger.debug("Imported/Updated %s book%s for author" % (modified_count, plural(modified_count)))

        myDB.action('insert into stats values ("%s", %i, %i, %i, %i, %i, %i, %i, %i, %i)' %
                    (authorname, api_hits, gr_lang_hits, lt_lang_hits, gb_lang_change, cache_hits,
                     ignored, removedResults, not_cached, duplicates))

        if refresh:
            logger.info("[%s] Book processing complete: Added %s book%s / Updated %s book%s" %
                        (authorname, added_count, plural(added_count), updated_count, plural(updated_count)))
        else:
            logger.info("[%s] Book processing complete: Added %s book%s to the database" %
                        (authorname, added_count, plural(added_count)))

    return books_dict
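# The language-caching logic above keys the languages table on a 3-digit ISBN prefix:
# characters 0-2 of a 10-digit ISBN, or characters 3-5 of a 13-digit ISBN (after the
# leading 978/979). This small standalone sketch of the prefix extraction is written
# from the comments and slicing above, not taken from the LazyLibrarian helpers.

def isbn_language_prefix(isbn):
    # Strip hyphens/spaces, then pick the region/language group digits
    isbn = isbn.replace('-', '').replace(' ', '')
    if len(isbn) == 13:
        return isbn[3:6]
    if len(isbn) == 10:
        return isbn[0:3]
    return ''  # not a usable ISBN, caller falls back to an API lookup

# e.g. isbn_language_prefix('9782070360024') -> '207' (leading digit 2 marks the French-language group)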
def get_author_books(self, authorid=None, authorname=None, refresh=False): api_hits = 0 gr_lang_hits = 0 lt_lang_hits = 0 gb_lang_change = 0 cache_hits = 0 not_cached = 0 URL = 'http://www.goodreads.com/author/list/' + authorid + '.xml?' + urllib.urlencode(self.params) # Artist is loading myDB = database.DBConnection() controlValueDict = {"AuthorID": authorid} newValueDict = {"Status": "Loading"} myDB.upsert("authors", newValueDict, controlValueDict) books_dict = [] try: rootxml, in_cache = get_xml_request(URL, useCache=not refresh) except Exception as e: logger.error("Error fetching author books: %s" % e) return books_dict if rootxml is None: logger.debug("Error requesting author books") return books_dict if not in_cache: api_hits = api_hits + 1 resultxml = rootxml.getiterator('book') valid_langs = ([valid_lang.strip() for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')]) if not len(resultxml): logger.warn('[%s] No books found for author with ID: %s' % (authorname, authorid)) else: logger.debug("[%s] Now processing books with GoodReads API" % authorname) resultsCount = 0 removedResults = 0 duplicates = 0 ignored = 0 added_count = 0 updated_count = 0 book_ignore_count = 0 total_count = 0 logger.debug(u"url " + URL) authorNameResult = rootxml.find('./author/name').text logger.debug(u"author name " + authorNameResult) loopCount = 1 while resultxml is not None: for book in resultxml: total_count = total_count + 1 if (book.find('publication_year').text is None): pubyear = "0000" else: pubyear = book.find('publication_year').text try: bookimg = book.find('image_url').text if ('nocover' in bookimg): bookimg = 'images/nocover.png' except (KeyError,AttributeError): bookimg = 'images/nocover.png' # PAB this next section tries to get the book language using the isbn13 to look it up. If no isbn13 we skip the # book entirely, rather than including it with an "Unknown" language. Changed this so we can still include the book # with language set to "Unknown". There is a setting in config.ini to allow or skip books with "Unknown" language # if you really don't want to include them. # Not all GR books have isbn13 filled in, but all have a GR bookid, which we've already got, so use that. # Also, with GR API rules we can only call the API once per second, which slows us down a lot when all we want # is to get the language. We sleep for one second per book that GR knows about for each author you have in your # library. The libraryThing API has the same 1 second restriction, and is limited to 1000 hits per day, but has # fewer books with unknown language. To get around this and speed up the process, see if we already have a book # in the database with a similar start to the ISBN. The way ISBNs work, digits 3-5 of a 13 char ISBN or digits 0-2 # of a 10 digit ISBN indicate the region/language so if two books have the same 3 digit isbn code, they _should_ # be the same language. # I ran a simple python script on my library of 1500 books, and these codes were 100% correct on matching book # languages, no mis-matches. It did result in a small number of books with "unknown" language being wrongly matched # but most "unknown" were matched to the correct language. # We could look up ISBNs we already know about in the database, but this only holds books in the languages we want # to keep, which reduces the number of cache hits, so we create a new database table, holding ALL results including # the ISBNs for languages we don't want and books we reject. 
# The new table is created (if not exists) in init.py so by the time we get here there is an existing table. # If we haven't an already matching partial ISBN, look up language code from libraryThing # "http://www.librarything.com/api/thingLang.php?isbn=1234567890" # If you find a matching language, add it to the database. If "unknown" or "invalid", try GR as maybe GR can # provide a match. # If both LT and GR return unknown, add isbn to db as "unknown". No point in repeatedly asking LT for a code # it's told you it doesn't know. # As an extra option, if language includes "All" in config.ini, we can skip this whole section and process # everything much faster by not querying for language at all. # It does mean we include a lot of unwanted foreign translations in the database, but it's _much_ faster. bookLanguage = "Unknown" find_field = "id" isbn = "" isbnhead = "" if "All" not in valid_langs: # do we care about language if (book.find('isbn').text is not None): find_field = "isbn" isbn = book.find('isbn').text isbnhead = isbn[0:3] else: if (book.find('isbn13').text is not None): find_field = "isbn13" isbn = book.find('isbn13').text isbnhead = isbn[3:6] if (find_field != 'id'): # isbn or isbn13 found match = myDB.action('SELECT lang FROM languages where isbn = "%s"' % (isbnhead)).fetchone() if (match): bookLanguage = match['lang'] cache_hits = cache_hits + 1 logger.debug("Found cached language [%s] for %s [%s]" % (bookLanguage, find_field, isbnhead)) else: # no match in cache, try searching librarything for a language code using the isbn # if no language found, librarything return value is "invalid" or "unknown" # returns plain text, not xml BOOK_URL = 'http://www.librarything.com/api/thingLang.php?isbn=' + isbn try: librarything_wait() resp = urllib2.urlopen(BOOK_URL, timeout=30).read() lt_lang_hits = lt_lang_hits + 1 logger.debug("LibraryThing reports language [%s] for %s" % (resp, isbnhead)) if ('invalid' in resp or 'Unknown' in resp): find_field = "id" # reset the field to force search on goodreads else: bookLanguage = resp # found a language code myDB.action('insert into languages values ("%s", "%s")' % (isbnhead, bookLanguage)) logger.debug(u"LT language %s: %s" % (isbnhead, bookLanguage)) except Exception as e: logger.error("Error finding LT language result for [%s], %s" % (isbn, e)) find_field = "id" # reset the field to search on goodreads if (find_field == 'id'): # [or bookLanguage == "Unknown"] no earlier match, we'll have to search the goodreads api try: if (book.find(find_field).text is not None): BOOK_URL = 'http://www.goodreads.com/book/show?id=' + \ book.find(find_field).text + '&' + urllib.urlencode(self.params) logger.debug(u"Book URL: " + BOOK_URL) try: time_now = int(time.time()) if time_now <= lazylibrarian.LAST_GOODREADS: time.sleep(1) BOOK_rootxml, in_cache = get_xml_request(BOOK_URL) if BOOK_rootxml is None: logger.debug('Error requesting book language code') bookLanguage = "" else: if not in_cache: # only update last_goodreads if the result wasn't found in the cache lazylibrarian.LAST_GOODREADS = time_now bookLanguage = BOOK_rootxml.find('./book/language_code').text except Exception as e: logger.error("Error finding book results: %s" % e) if not in_cache: gr_lang_hits = gr_lang_hits + 1 if not bookLanguage: bookLanguage = "Unknown" if (isbnhead != ""): # GR didn't give an isbn so we can't cache it, just use language for this book myDB.action('insert into languages values ("%s", "%s")' % (isbnhead, bookLanguage)) logger.debug("GoodReads reports language [%s] for %s" % 
(bookLanguage, isbnhead)) else: not_cached = not_cached + 1 logger.debug(u"GR language: " + bookLanguage) else: logger.debug("No %s provided for [%s]" % (find_field, book.find('title').text)) # continue except Exception as e: logger.debug(u"An error has occured: %s" % e) if bookLanguage not in valid_langs: logger.debug('Skipped a book with language %s' % bookLanguage) ignored = ignored + 1 continue bookname = book.find('title').text bookid = book.find('id').text bookdesc = book.find('description').text bookisbn = book.find('isbn').text bookpub = book.find('publisher').text booklink = book.find('link').text bookrate = float(book.find('average_rating').text) bookpages = book.find('num_pages').text bookname = unaccented(bookname) if ': ' in bookname: parts = bookname.split(': ', 1) bookname = parts[0] booksub = parts[1] else: booksub = '' dic = {':': '', '"': '', '\'': ''} bookname = replace_all(bookname, dic) bookname = bookname.strip() # strip whitespace booksub = replace_all(booksub, dic) booksub = booksub.strip() # strip whitespace if booksub: series,seriesNum = bookSeries(booksub) else: series,seriesNum = bookSeries(bookname) # GoodReads sometimes has multiple bookids for the same book (same author/title, different editions) # and sometimes uses the same bookid if the book is the same but the title is slightly different # We use bookid, then reject if another author/title has a different bookid so we just keep one... find_book_status = myDB.select('SELECT * FROM books WHERE BookID = "%s"' % bookid) if find_book_status: for resulted in find_book_status: book_status = resulted['Status'] locked = resulted ['Manual'] else: book_status = lazylibrarian.NEWBOOK_STATUS locked = False rejected = False if re.match('[^\w-]', bookname): # reject books with bad characters in title logger.debug(u"removed result [" + bookname + "] for bad characters") removedResults = removedResults + 1 rejected = True if not rejected and not bookname: logger.debug('Rejecting bookid %s for %s, no bookname' % (bookid, authorNameResult)) removedResults = removedResults + 1 rejected = True if not rejected: find_books = myDB.select('SELECT * FROM books WHERE BookName = "%s" and AuthorName = "%s"' % (bookname, authorNameResult)) if find_books: for find_book in find_books: if find_book['BookID'] != bookid: # we have a book with this author/title already logger.debug('Rejecting bookid %s for [%s][%s] already got %s' % (find_book['BookID'], authorNameResult, bookname, bookid)) duplicates = duplicates + 1 rejected = True break if not rejected: find_books = myDB.select('SELECT * FROM books WHERE BookID = "%s"' % bookid) if find_books: # we have a book with this bookid already logger.debug('Rejecting bookid %s for [%s][%s] already got this bookid in database' % (bookid, authorNameResult, bookname)) duplicates = duplicates + 1 rejected = True break if not rejected: if book_status != "Ignored": if not locked: controlValueDict = {"BookID": bookid} newValueDict = { "AuthorName": authorNameResult, "AuthorID": authorid, "AuthorLink": None, "BookName": bookname, "BookSub": booksub, "BookDesc": bookdesc, "BookIsbn": bookisbn, "BookPub": bookpub, "BookGenre": None, "BookImg": bookimg, "BookLink": booklink, "BookRate": bookrate, "BookPages": bookpages, "BookDate": pubyear, "BookLang": bookLanguage, "Status": book_status, "BookAdded": today(), "Series": series, "SeriesNum": seriesNum } resultsCount = resultsCount + 1 myDB.upsert("books", newValueDict, controlValueDict) logger.debug(u"Book found: " + book.find('title').text + " " + pubyear) 
if 'nocover' in bookimg or 'nophoto' in bookimg: # try to get a cover from librarything workcover = getBookCover(bookid) if workcover: logger.debug(u'Updated cover for %s to %s' % (bookname, workcover)) controlValueDict = {"BookID": bookid} newValueDict = {"BookImg": workcover} myDB.upsert("books", newValueDict, controlValueDict) elif bookimg and bookimg.startswith('http'): link = cache_cover(bookid, bookimg) if link is not None: controlValueDict = {"BookID": bookid} newValueDict = {"BookImg": link} myDB.upsert("books", newValueDict, controlValueDict) if seriesNum == None: # try to get series info from librarything series, seriesNum = getWorkSeries(bookid) if seriesNum: logger.debug(u'Updated series: %s [%s]' % (series, seriesNum)) controlValueDict = {"BookID": bookid} newValueDict = { "Series": series, "SeriesNum": seriesNum } myDB.upsert("books", newValueDict, controlValueDict) worklink = getWorkPage(bookid) if worklink: controlValueDict = {"BookID": bookid} newValueDict = {"WorkPage": worklink} myDB.upsert("books", newValueDict, controlValueDict) if not find_book_status: logger.debug(u"[%s] Added book: %s" % (authorname, bookname)) added_count = added_count + 1 else: logger.debug(u"[%s] Updated book: %s" % (authorname, bookname)) updated_count = updated_count + 1 else: book_ignore_count = book_ignore_count + 1 loopCount = loopCount + 1 URL = 'http://www.goodreads.com/author/list/' + authorid + '.xml?' + \ urllib.urlencode(self.params) + '&page=' + str(loopCount) resultxml = None try: rootxml, in_cache = get_xml_request(URL, useCache=not refresh) if rootxml is None: logger.debug('Error requesting next page of results') else: resultxml = rootxml.getiterator('book') if not in_cache: api_hits = api_hits + 1 except Exception as e: resultxml = None logger.error("Error finding next page of results: %s" % e) if resultxml is not None: if all(False for book in resultxml): # returns True if iterator is empty resultxml = None lastbook = myDB.action('SELECT BookName, BookLink, BookDate from books WHERE AuthorID="%s" \ AND Status != "Ignored" order by BookDate DESC' % authorid).fetchone() if lastbook: lastbookname = lastbook['BookName'] lastbooklink = lastbook['BookLink'] lastbookdate = lastbook['BookDate'] else: lastbookname = None lastbooklink = None lastbookdate = None controlValueDict = {"AuthorID": authorid} newValueDict = { "Status": "Active", "LastBook": lastbookname, "LastLink": lastbooklink, "LastDate": lastbookdate } myDB.upsert("authors", newValueDict, controlValueDict) # This is here because GoodReads sometimes has several entries with the same BookID! 
        modified_count = added_count + updated_count

        logger.debug("Found %s total book%s for author" % (total_count, plural(total_count)))
        logger.debug("Removed %s bad language result%s for author" % (ignored, plural(ignored)))
        logger.debug("Removed %s bad character or no-name result%s for author" %
                     (removedResults, plural(removedResults)))
        logger.debug("Removed %s duplicate result%s for author" % (duplicates, plural(duplicates)))
        logger.debug("Ignored %s book%s by author marked as Ignored" %
                     (book_ignore_count, plural(book_ignore_count)))
        logger.debug("Imported/Updated %s book%s for author" % (modified_count, plural(modified_count)))

        myDB.action('insert into stats values ("%s", %i, %i, %i, %i, %i, %i, %i, %i, %i)' %
                    (authorname, api_hits, gr_lang_hits, lt_lang_hits, gb_lang_change,
                     cache_hits, ignored, removedResults, not_cached, duplicates))

        if refresh:
            logger.info("[%s] Book processing complete: Added %s book%s / Updated %s book%s" %
                        (authorname, added_count, plural(added_count), updated_count, plural(updated_count)))
        else:
            logger.info("[%s] Book processing complete: Added %s book%s to the database" %
                        (authorname, added_count, plural(added_count)))

        return books_dict
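# --- Illustrative sketch (not part of LazyLibrarian) -------------------------
# The controlValueDict/newValueDict pairs used throughout this module feed
# myDB.upsert(), which updates the row matching the control columns or inserts
# a new row when nothing matches.  The stand-alone sketch below shows one
# plausible way such an upsert could be built on sqlite3; table and column
# names here are hypothetical examples, and this is not the project's
# database.DBConnection implementation.

import sqlite3


def upsert(conn, table, new_values, control_values):
    # Build "col=?" fragments for SET and WHERE from the two dictionaries
    changes = ', '.join('%s=?' % col for col in new_values)
    where = ' AND '.join('%s=?' % col for col in control_values)
    args = list(new_values.values()) + list(control_values.values())
    cur = conn.execute('UPDATE %s SET %s WHERE %s' % (table, changes, where), args)
    if not cur.rowcount:
        # No existing row matched the control columns, so insert a new one
        cols = list(new_values) + list(control_values)
        placeholders = ', '.join('?' * len(cols))
        conn.execute('INSERT INTO %s (%s) VALUES (%s)' %
                     (table, ', '.join(cols), placeholders), args)
    conn.commit()


if __name__ == '__main__':
    db = sqlite3.connect(':memory:')
    db.execute('CREATE TABLE books (BookID TEXT, BookName TEXT, Status TEXT)')
    upsert(db, 'books', {'BookName': 'Example', 'Status': 'Wanted'}, {'BookID': '1'})
    upsert(db, 'books', {'Status': 'Have'}, {'BookID': '1'})
    print(db.execute('SELECT * FROM books').fetchall())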
    def find_book(self, bookid=None, queue=None):
        myDB = database.DBConnection()
        URL = 'https://www.goodreads.com/book/show/' + bookid + '?' + urllib.urlencode(self.params)

        try:
            rootxml, in_cache = get_xml_request(URL)
            if rootxml is None:
                logger.debug("Error requesting book")
                return
        except Exception as e:
            logger.error("Error finding book: %s" % str(e))
            return

        bookLanguage = rootxml.find('./book/language_code').text
        bookname = rootxml.find('./book/title').text

        if not bookLanguage:
            bookLanguage = "Unknown"
        #
        # PAB user has said they want this book, don't block for unwanted language, just warn
        #
        valid_langs = getList(lazylibrarian.CONFIG['IMP_PREFLANG'])
        if bookLanguage not in valid_langs:
            logger.debug('Book %s goodreads language does not match preference, %s' % (bookname, bookLanguage))

        if rootxml.find('./book/publication_year').text is None:
            bookdate = "0000"
        else:
            bookdate = rootxml.find('./book/publication_year').text

        try:
            bookimg = rootxml.find('./book/img_url').text
            if 'assets/nocover' in bookimg:
                bookimg = 'images/nocover.png'
        except (KeyError, AttributeError):
            bookimg = 'images/nocover.png'

        authorname = rootxml.find('./book/authors/author/name').text
        bookdesc = rootxml.find('./book/description').text
        bookisbn = rootxml.find('./book/isbn').text
        bookpub = rootxml.find('./book/publisher').text
        booklink = rootxml.find('./book/link').text
        bookrate = float(rootxml.find('./book/average_rating').text)
        bookpages = rootxml.find('./book/num_pages').text

        name = authorname
        GR = GoodReads(name)
        author = GR.find_author_id()
        if author:
            AuthorID = author['authorid']
            match = myDB.match('SELECT AuthorID from authors WHERE AuthorID="%s"' % AuthorID)
            if not match:
                match = myDB.match('SELECT AuthorID from authors WHERE AuthorName="%s"' % author['authorname'])
                if match:
                    logger.debug('%s: Changing authorid from %s to %s' %
                                 (author['authorname'], AuthorID, match['AuthorID']))
                    AuthorID = match['AuthorID']  # we have a different authorid for that authorname
                else:
                    # no author but request to add book, add author as "ignored"
                    # User hit "add book" button from a search
                    controlValueDict = {"AuthorID": AuthorID}
                    newValueDict = {
                        "AuthorName": author['authorname'],
                        "AuthorImg": author['authorimg'],
                        "AuthorLink": author['authorlink'],
                        "AuthorBorn": author['authorborn'],
                        "AuthorDeath": author['authordeath'],
                        "DateAdded": today(),
                        "Status": "Ignored"
                    }
                    myDB.upsert("authors", newValueDict, controlValueDict)
        else:
            logger.warn("No AuthorID for %s, unable to add book %s" % (authorname, bookname))
            return

        bookname = unaccented(bookname)
        bookname, booksub = split_title(authorname, bookname)
        dic = {':': '.', '"': '', '\'': ''}
        bookname = replace_all(bookname, dic).strip()
        booksub = replace_all(booksub, dic).strip()

        if booksub:
            series, seriesNum = bookSeries(booksub)
        else:
            series, seriesNum = bookSeries(bookname)

        controlValueDict = {"BookID": bookid}
        newValueDict = {
            "AuthorID": AuthorID,
            "BookName": bookname,
            "BookSub": booksub,
            "BookDesc": bookdesc,
            "BookIsbn": bookisbn,
            "BookPub": bookpub,
            "BookGenre": "",
            "BookImg": bookimg,
            "BookLink": booklink,
            "BookRate": bookrate,
            "BookPages": bookpages,
            "BookDate": bookdate,
            "BookLang": bookLanguage,
            "Status": "Wanted",
            "BookAdded": today()
        }

        myDB.upsert("books", newValueDict, controlValueDict)
        logger.info("%s added to the books database" % bookname)

        if 'nocover' in bookimg or 'nophoto' in bookimg:
            # try to get a cover from librarything
            workcover = getBookCover(bookid)
            if workcover:
                logger.debug(u'Updated cover for %s to %s' % (bookname, workcover))
                controlValueDict = {"BookID": bookid}
                newValueDict = {"BookImg": workcover}
                myDB.upsert("books", newValueDict, controlValueDict)

        elif bookimg and bookimg.startswith('http'):
            link, success = cache_img("book", bookid, bookimg)
            if success:
                controlValueDict = {"BookID": bookid}
                newValueDict = {"BookImg": link}
                myDB.upsert("books", newValueDict, controlValueDict)
            else:
                logger.debug('Failed to cache image for %s' % bookimg)

        if lazylibrarian.CONFIG['ADD_SERIES']:
            # prefer series info from librarything
            seriesdict = getWorkSeries(bookid)
            if seriesdict:
                logger.debug(u'Updated series: %s [%s]' % (bookid, seriesdict))
            else:
                if series:
                    seriesdict = {cleanName(unaccented(series)): seriesNum}
            setSeries(seriesdict, bookid)

        worklink = getWorkPage(bookid)
        if worklink:
            controlValueDict = {"BookID": bookid}
            newValueDict = {"WorkPage": worklink}
            myDB.upsert("books", newValueDict, controlValueDict)
def get_author_books(self, authorid=None, authorname=None, bookstatus="Skipped", refresh=False): try: api_hits = 0 gr_lang_hits = 0 lt_lang_hits = 0 gb_lang_change = 0 cache_hits = 0 not_cached = 0 URL = 'http://www.goodreads.com/author/list/' + authorid + '.xml?' + urllib.urlencode(self.params) # Artist is loading myDB = database.DBConnection() controlValueDict = {"AuthorID": authorid} newValueDict = {"Status": "Loading"} myDB.upsert("authors", newValueDict, controlValueDict) try: rootxml, in_cache = get_xml_request(URL, useCache=not refresh) except Exception as e: logger.error("Error fetching author books: %s" % str(e)) return if rootxml is None: logger.debug("Error requesting author books") return if not in_cache: api_hits += 1 resultxml = rootxml.getiterator('book') valid_langs = getList(lazylibrarian.CONFIG['IMP_PREFLANG']) resultsCount = 0 removedResults = 0 duplicates = 0 ignored = 0 added_count = 0 updated_count = 0 book_ignore_count = 0 total_count = 0 if not len(resultxml): logger.warn('[%s] No books found for author with ID: %s' % (authorname, authorid)) else: logger.debug("[%s] Now processing books with GoodReads API" % authorname) logger.debug(u"url " + URL) authorNameResult = rootxml.find('./author/name').text # Goodreads sometimes puts extra whitepase in the author names! authorNameResult = ' '.join(authorNameResult.split()) logger.debug(u"GoodReads author name [%s]" % authorNameResult) loopCount = 1 while resultxml: for book in resultxml: total_count += 1 if book.find('publication_year').text is None: pubyear = "0000" else: pubyear = book.find('publication_year').text try: bookimg = book.find('image_url').text if 'nocover' in bookimg: bookimg = 'images/nocover.png' except (KeyError, AttributeError): bookimg = 'images/nocover.png' bookLanguage = "Unknown" find_field = "id" isbn = "" isbnhead = "" if "All" not in valid_langs: # do we care about language if book.find('isbn').text: find_field = "isbn" isbn = book.find('isbn').text isbnhead = isbn[0:3] else: if book.find('isbn13').text: find_field = "isbn13" isbn = book.find('isbn13').text isbnhead = isbn[3:6] # Try to use shortcut of ISBN identifier codes described here... 
# https://en.wikipedia.org/wiki/List_of_ISBN_identifier_groups if isbnhead: if find_field == "isbn13" and isbn.startswith('979'): for item in lazylibrarian.isbn_979_dict: if isbnhead.startswith(item): bookLanguage = lazylibrarian.isbn_979_dict[item] break if bookLanguage != "Unknown": logger.debug("ISBN979 returned %s for %s" % (bookLanguage, isbnhead)) elif (find_field == "isbn") or (find_field == "isbn13" and isbn.startswith('978')): for item in lazylibrarian.isbn_978_dict: if isbnhead.startswith(item): bookLanguage = lazylibrarian.isbn_978_dict[item] break if bookLanguage != "Unknown": logger.debug("ISBN978 returned %s for %s" % (bookLanguage, isbnhead)) if bookLanguage == "Unknown" and isbnhead: # Nothing in the isbn dictionary, try any cached results match = myDB.match('SELECT lang FROM languages where isbn = "%s"' % isbnhead) if match: bookLanguage = match['lang'] cache_hits += 1 logger.debug("Found cached language [%s] for %s [%s]" % (bookLanguage, find_field, isbnhead)) else: # no match in cache, try searching librarything for a language code using the isbn # if no language found, librarything return value is "invalid" or "unknown" # returns plain text, not xml BOOK_URL = 'http://www.librarything.com/api/thingLang.php?isbn=' + isbn try: librarything_wait() resp = urllib2.urlopen(BOOK_URL, timeout=30).read() lt_lang_hits += 1 logger.debug("LibraryThing reports language [%s] for %s" % (resp, isbnhead)) if 'invalid' in resp or 'Unknown' in resp: bookLanguage = "Unknown" else: bookLanguage = resp # found a language code myDB.action('insert into languages values ("%s", "%s")' % (isbnhead, bookLanguage)) logger.debug(u"LT language %s: %s" % (isbnhead, bookLanguage)) except Exception as e: logger.error("Error finding LT language result for [%s], %s" % (isbn, str(e))) if bookLanguage == "Unknown": # still no earlier match, we'll have to search the goodreads api try: if book.find(find_field).text: BOOK_URL = 'http://www.goodreads.com/book/show?id=' + \ book.find(find_field).text + '&' + urllib.urlencode(self.params) logger.debug(u"Book URL: " + BOOK_URL) time_now = int(time.time()) if time_now <= lazylibrarian.LAST_GOODREADS: time.sleep(1) bookLanguage = "" try: BOOK_rootxml, in_cache = get_xml_request(BOOK_URL) if BOOK_rootxml is None: logger.debug('Error requesting book language code') else: if not in_cache: # only update last_goodreads if the result wasn't found in the cache lazylibrarian.LAST_GOODREADS = time_now try: bookLanguage = BOOK_rootxml.find('./book/language_code').text except Exception as e: logger.debug("Error finding language_code in book xml: %s" % str(e)) except Exception as e: logger.debug("Error getting book xml: %s" % str(e)) if not in_cache: gr_lang_hits += 1 if not bookLanguage: bookLanguage = "Unknown" # At this point, give up? # WhatWork on author/title doesn't give us a language. # It might give us the "original language" of the book (but not always) # and our copy might not be in the original language anyway # eg "The Girl With the Dragon Tattoo" original language Swedish # If we have an isbn, try WhatISBN to get alternatives # in case any of them give us a language, but it seems if thinglang doesn't # have a language for the first isbn code, it doesn't for any of the # alternatives either # Goodreads search results don't include the language. 
Although sometimes # it's in the html page, it's not in the xml results if isbnhead != "": # if GR didn't give an isbn we can't cache it, just use language for this book myDB.action('insert into languages values ("%s", "%s")' % (isbnhead, bookLanguage)) logger.debug("GoodReads reports language [%s] for %s" % (bookLanguage, isbnhead)) else: not_cached += 1 logger.debug(u"GR language: " + bookLanguage) else: logger.debug("No %s provided for [%s]" % (find_field, book.find('title').text)) # continue except Exception as e: logger.debug(u"Goodreads language search failed: %s" % str(e)) if bookLanguage not in valid_langs: logger.debug('Skipped %s with language %s' % (book.find('title').text, bookLanguage)) ignored += 1 continue bookname = book.find('title').text bookid = book.find('id').text bookdesc = book.find('description').text bookisbn = book.find('isbn').text bookpub = book.find('publisher').text booklink = book.find('link').text bookrate = float(book.find('average_rating').text) bookpages = book.find('num_pages').text bookname = unaccented(bookname) bookname, booksub = split_title(authorNameResult, bookname) dic = {':': '.', '"': ''} # do we need to strip apostrophes , '\'': ''} bookname = replace_all(bookname, dic) bookname = bookname.strip() # strip whitespace booksub = replace_all(booksub, dic) booksub = booksub.strip() # strip whitespace if booksub: series, seriesNum = bookSeries(booksub) else: series, seriesNum = bookSeries(bookname) rejected = False check_status = False if re.match('[^\w-]', bookname): # reject books with bad characters in title logger.debug(u"removed result [" + bookname + "] for bad characters") removedResults += 1 rejected = True if not rejected and lazylibrarian.CONFIG['NO_FUTURE']: if pubyear > today()[:4]: logger.debug('Rejecting %s, future publication date %s' % (bookname, pubyear)) removedResults += 1 rejected = True if not rejected and not bookname: logger.debug('Rejecting bookid %s for %s, no bookname' % (bookid, authorNameResult)) removedResults += 1 rejected = True if not rejected: cmd = 'SELECT BookID FROM books,authors WHERE books.AuthorID = authors.AuthorID' cmd += ' and BookName = "%s" COLLATE NOCASE and AuthorName = "%s" COLLATE NOCASE' % \ (bookname, authorNameResult.replace('"', '""')) match = myDB.match(cmd) if match: if match['BookID'] != bookid: # we have a different book with this author/title already logger.debug('Rejecting bookid %s for [%s][%s] already got %s' % (match['BookID'], authorNameResult, bookname, bookid)) duplicates += 1 rejected = True if not rejected: cmd = 'SELECT AuthorName,BookName FROM books,authors' cmd += ' WHERE authors.AuthorID = books.AuthorID AND BookID=%s' % bookid match = myDB.match(cmd) if match: # we have a book with this bookid already if bookname != match['BookName'] or authorNameResult != match['AuthorName']: logger.debug('Rejecting bookid %s for [%s][%s] already got bookid for [%s][%s]' % (bookid, authorNameResult, bookname, match['AuthorName'], match['BookName'])) else: logger.debug('Rejecting bookid %s for [%s][%s] already got this book in database' % (bookid, authorNameResult, bookname)) check_status = True duplicates += 1 rejected = True if check_status or not rejected: existing_book = myDB.match('SELECT Status,Manual FROM books WHERE BookID = "%s"' % bookid) if existing_book: book_status = existing_book['Status'] locked = existing_book['Manual'] if locked is None: locked = False elif locked.isdigit(): locked = bool(int(locked)) else: book_status = bookstatus # new_book status, or new_author status locked = 
False # Is the book already in the database? # Leave alone if locked or status "ignore" if not locked and book_status != "Ignored": controlValueDict = {"BookID": bookid} newValueDict = { "AuthorID": authorid, "BookName": bookname, "BookSub": booksub, "BookDesc": bookdesc, "BookIsbn": bookisbn, "BookPub": bookpub, "BookGenre": "", "BookImg": bookimg, "BookLink": booklink, "BookRate": bookrate, "BookPages": bookpages, "BookDate": pubyear, "BookLang": bookLanguage, "Status": book_status, "BookAdded": today() } resultsCount += 1 updated = False myDB.upsert("books", newValueDict, controlValueDict) logger.debug(u"Book found: " + book.find('title').text + " " + pubyear) if 'nocover' in bookimg or 'nophoto' in bookimg: # try to get a cover from librarything workcover = getBookCover(bookid) if workcover: logger.debug(u'Updated cover for %s to %s' % (bookname, workcover)) controlValueDict = {"BookID": bookid} newValueDict = {"BookImg": workcover} myDB.upsert("books", newValueDict, controlValueDict) updated = True elif bookimg and bookimg.startswith('http'): link, success = cache_img("book", bookid, bookimg, refresh=refresh) if success: controlValueDict = {"BookID": bookid} newValueDict = {"BookImg": link} myDB.upsert("books", newValueDict, controlValueDict) updated = True else: logger.debug('Failed to cache image for %s' % bookimg) seriesdict = {} if lazylibrarian.CONFIG['ADD_SERIES']: # prefer series info from librarything seriesdict = getWorkSeries(bookid) if seriesdict: logger.debug(u'Updated series: %s [%s]' % (bookid, seriesdict)) updated = True else: if series: seriesdict = {cleanName(unaccented(series)): seriesNum} setSeries(seriesdict, bookid) new_status = setStatus(bookid, seriesdict, bookstatus) if not new_status == book_status: book_status = new_status updated = True worklink = getWorkPage(bookid) if worklink: controlValueDict = {"BookID": bookid} newValueDict = {"WorkPage": worklink} myDB.upsert("books", newValueDict, controlValueDict) if not existing_book: logger.debug(u"[%s] Added book: %s [%s] status %s" % (authorname, bookname, bookLanguage, book_status)) added_count += 1 elif updated: logger.debug(u"[%s] Updated book: %s [%s] status %s" % (authorname, bookname, bookLanguage, book_status)) updated_count += 1 else: book_ignore_count += 1 loopCount += 1 URL = 'http://www.goodreads.com/author/list/' + authorid + '.xml?' 
+ \ urllib.urlencode(self.params) + '&page=' + str(loopCount) resultxml = None try: rootxml, in_cache = get_xml_request(URL, useCache=not refresh) if rootxml is None: logger.debug('Error requesting next page of results') else: resultxml = rootxml.getiterator('book') if not in_cache: api_hits += 1 except Exception as e: resultxml = None logger.error("Error finding next page of results: %s" % str(e)) if resultxml: if all(False for _ in resultxml): # returns True if iterator is empty resultxml = None deleteEmptySeries() lastbook = myDB.match('SELECT BookName, BookLink, BookDate, BookImg from books WHERE AuthorID="%s" \ AND Status != "Ignored" order by BookDate DESC' % authorid) if lastbook: lastbookname = lastbook['BookName'] lastbooklink = lastbook['BookLink'] lastbookdate = lastbook['BookDate'] lastbookimg = lastbook['BookImg'] else: lastbookname = "" lastbooklink = "" lastbookdate = "" lastbookimg = "" controlValueDict = {"AuthorID": authorid} newValueDict = { "Status": "Active", "LastBook": lastbookname, "LastLink": lastbooklink, "LastDate": lastbookdate, "LastBookImg": lastbookimg } myDB.upsert("authors", newValueDict, controlValueDict) # This is here because GoodReads sometimes has several entries with the same BookID! modified_count = added_count + updated_count logger.debug("Found %s result%s" % (total_count, plural(total_count))) logger.debug("Removed %s unwanted language result%s" % (ignored, plural(ignored))) logger.debug( "Removed %s bad character or no-name result%s" % (removedResults, plural(removedResults))) logger.debug("Removed %s duplicate result%s" % (duplicates, plural(duplicates))) logger.debug("Found %s book%s by author marked as Ignored" % (book_ignore_count, plural(book_ignore_count))) logger.debug("Imported/Updated %s book%s" % (modified_count, plural(modified_count))) myDB.action('insert into stats values ("%s", %i, %i, %i, %i, %i, %i, %i, %i, %i)' % (authorname.replace('"', '""'), api_hits, gr_lang_hits, lt_lang_hits, gb_lang_change, cache_hits, ignored, removedResults, not_cached, duplicates)) if refresh: logger.info("[%s] Book processing complete: Added %s book%s / Updated %s book%s" % (authorname, added_count, plural(added_count), updated_count, plural(updated_count))) else: logger.info("[%s] Book processing complete: Added %s book%s to the database" % (authorname, added_count, plural(added_count))) except Exception: logger.error('Unhandled exception in GR.get_author_books: %s' % traceback.format_exc())
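# --- Illustrative sketch (not part of LazyLibrarian) -------------------------
# get_author_books() above tries to guess a book's language from the ISBN
# registration-group prefix before spending a LibraryThing or Goodreads call:
# for an ISBN-13 it looks at the digits after the 978/979 prefix, for an
# ISBN-10 at the leading digits, and matches them against a prefix->language
# table.  This stand-alone sketch mirrors that lookup with a deliberately
# tiny, partial prefix table (not lazylibrarian.isbn_978_dict).

ISBN_978_PREFIXES = {
    '0': 'eng', '1': 'eng', '2': 'fre', '3': 'ger', '4': 'jap', '5': 'rus',
}


def guess_language(isbn):
    """Guess a language code from an ISBN-10/13 group prefix, else 'Unknown'."""
    isbn = isbn.replace('-', '').strip()
    if len(isbn) == 13 and isbn.startswith('978'):
        head = isbn[3:6]   # registration group follows the 978 prefix
    elif len(isbn) == 10:
        head = isbn[0:3]   # registration group leads an ISBN-10
    else:
        return "Unknown"   # e.g. 979-prefixed ISBNs need their own table
    for prefix, lang in ISBN_978_PREFIXES.items():
        if head.startswith(prefix):
            return lang
    return "Unknown"


if __name__ == '__main__':
    print(guess_language('9780306406157'))   # eng
    print(guess_language('2070612880'))      # fre
    print(guess_language('9791090636071'))   # Unknown (979 group not in table)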
def get_author_books(self, authorid=None, authorname=None, refresh=False): try: api_hits = 0 gr_lang_hits = 0 lt_lang_hits = 0 gb_lang_change = 0 cache_hits = 0 not_cached = 0 URL = 'http://www.goodreads.com/author/list/' + authorid + '.xml?' + urllib.urlencode(self.params) # Artist is loading myDB = database.DBConnection() controlValueDict = {"AuthorID": authorid} newValueDict = {"Status": "Loading"} myDB.upsert("authors", newValueDict, controlValueDict) try: rootxml, in_cache = get_xml_request(URL, useCache=not refresh) except Exception as e: logger.error("Error fetching author books: %s" % str(e)) return if rootxml is None: logger.debug("Error requesting author books") return if not in_cache: api_hits = api_hits + 1 resultxml = rootxml.getiterator('book') valid_langs = ([valid_lang.strip() for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')]) if not len(resultxml): logger.warn('[%s] No books found for author with ID: %s' % (authorname, authorid)) else: logger.debug("[%s] Now processing books with GoodReads API" % authorname) resultsCount = 0 removedResults = 0 duplicates = 0 ignored = 0 added_count = 0 updated_count = 0 book_ignore_count = 0 total_count = 0 logger.debug(u"url " + URL) authorNameResult = rootxml.find('./author/name').text logger.debug(u"author name " + authorNameResult) loopCount = 1 isbn_979_dict = { "10": "fre", "11": "kor", "12": "ita" } isbn_978_dict = { "0": "eng", "1": "eng", "2": "fre", "3": "ger", "4": "jap", "5": "rus" } while resultxml: for book in resultxml: total_count = total_count + 1 if (book.find('publication_year').text is None): pubyear = "0000" else: pubyear = book.find('publication_year').text try: bookimg = book.find('image_url').text if ('nocover' in bookimg): bookimg = 'images/nocover.png' except (KeyError, AttributeError): bookimg = 'images/nocover.png' bookLanguage = "Unknown" find_field = "id" isbn = "" isbnhead = "" if "All" not in valid_langs: # do we care about language if book.find('isbn').text: find_field = "isbn" isbn = book.find('isbn').text isbnhead = isbn[0:3] else: if book.find('isbn13').text: find_field = "isbn13" isbn = book.find('isbn13').text isbnhead = isbn[3:6] if (find_field != 'id'): # isbn10 or isbn13 found # Try to use shortcut of ISBN identifier codes described here... 
# https://en.wikipedia.org/wiki/List_of_ISBN_identifier_groups if isbnhead != "": if find_field == "isbn13" and isbn.startswith('979'): for item in isbn_979_dict: if isbnhead.startswith(item): bookLanguage = isbn_979_dict[item] break if bookLanguage != "Unknown": logger.debug("ISBN979 returned %s for %s" % (bookLanguage, isbnhead)) elif (find_field == "isbn") or (find_field == "isbn13" and isbn.startswith('978')): for item in isbn_978_dict: if isbnhead.startswith(item): bookLanguage = isbn_978_dict[item] break if bookLanguage != "Unknown": logger.debug("ISBN978 returned %s for %s" % (bookLanguage, isbnhead)) if bookLanguage == "Unknown": # Nothing in the isbn dictionary, try any cached results match = myDB.match('SELECT lang FROM languages where isbn = "%s"' % (isbnhead)) if match: bookLanguage = match['lang'] cache_hits = cache_hits + 1 logger.debug("Found cached language [%s] for %s [%s]" % (bookLanguage, find_field, isbnhead)) else: # no match in cache, try searching librarything for a language code using the isbn # if no language found, librarything return value is "invalid" or "unknown" # returns plain text, not xml BOOK_URL = 'http://www.librarything.com/api/thingLang.php?isbn=' + isbn try: librarything_wait() resp = urllib2.urlopen(BOOK_URL, timeout=30).read() lt_lang_hits = lt_lang_hits + 1 logger.debug("LibraryThing reports language [%s] for %s" % (resp, isbnhead)) if ('invalid' in resp or 'Unknown' in resp): bookLanguage = "Unknown" else: bookLanguage = resp # found a language code myDB.action('insert into languages values ("%s", "%s")' % (isbnhead, bookLanguage)) logger.debug(u"LT language %s: %s" % (isbnhead, bookLanguage)) except Exception as e: logger.error("Error finding LT language result for [%s], %s" % (isbn, str(e))) if bookLanguage == "Unknown": # still no earlier match, we'll have to search the goodreads api try: if book.find(find_field).text: BOOK_URL = 'http://www.goodreads.com/book/show?id=' + \ book.find(find_field).text + '&' + urllib.urlencode(self.params) logger.debug(u"Book URL: " + BOOK_URL) try: time_now = int(time.time()) if time_now <= lazylibrarian.LAST_GOODREADS: time.sleep(1) BOOK_rootxml, in_cache = get_xml_request(BOOK_URL) if BOOK_rootxml is None: logger.debug('Error requesting book language code') bookLanguage = "" else: if not in_cache: # only update last_goodreads if the result wasn't found in the cache lazylibrarian.LAST_GOODREADS = time_now bookLanguage = BOOK_rootxml.find('./book/language_code').text except Exception as e: logger.error("Error finding book results: %s" % str(e)) if not in_cache: gr_lang_hits = gr_lang_hits + 1 if not bookLanguage: bookLanguage = "Unknown" # At this point, give up? # WhatWork on author/title doesn't give us a language. # It might give us the "original language" of the book (but not always) # and our copy might not be in the original language anyway # eg "The Girl With the Dragon Tattoo" original language Swedish # If we have an isbn, try WhatISBN to get alternatives # in case any of them give us a language, but it seems if thinglang doesn't # have a language for the first isbn code, it doesn't for any of the # alternatives either # Goodreads search results don't include the language. 
Although sometimes # it's in the html page, it's not in the xml results if (isbnhead != ""): # if GR didn't give an isbn we can't cache it, just use language for this book myDB.action('insert into languages values ("%s", "%s")' % (isbnhead, bookLanguage)) logger.debug("GoodReads reports language [%s] for %s" % (bookLanguage, isbnhead)) else: not_cached = not_cached + 1 logger.debug(u"GR language: " + bookLanguage) else: logger.debug("No %s provided for [%s]" % (find_field, book.find('title').text)) # continue except Exception as e: logger.debug(u"Goodreads language search failed: %s" % str(e)) if bookLanguage not in valid_langs: logger.debug('Skipped %s with language %s' % (book.find('title').text, bookLanguage)) ignored = ignored + 1 continue bookname = book.find('title').text bookid = book.find('id').text bookdesc = book.find('description').text bookisbn = book.find('isbn').text bookpub = book.find('publisher').text booklink = book.find('link').text bookrate = float(book.find('average_rating').text) bookpages = book.find('num_pages').text bookname = unaccented(bookname) bookname, booksub = split_title(authorNameResult, bookname) dic = {':': '', '"': '', '\'': ''} bookname = replace_all(bookname, dic) bookname = bookname.strip() # strip whitespace booksub = replace_all(booksub, dic) booksub = booksub.strip() # strip whitespace if booksub: series, seriesNum = bookSeries(booksub) else: series, seriesNum = bookSeries(bookname) # GoodReads sometimes has multiple bookids for the same book (same author/title, different editions) # and sometimes uses the same bookid if the book is the same but the title is slightly different # We use bookid, then reject if another author/title has a different bookid so we just keep one... find_book_status = myDB.select('SELECT * FROM books WHERE BookID = "%s"' % bookid) if find_book_status: for resulted in find_book_status: book_status = resulted['Status'] locked = resulted['Manual'] else: book_status = lazylibrarian.NEWBOOK_STATUS locked = False rejected = False if re.match('[^\w-]', bookname): # reject books with bad characters in title logger.debug(u"removed result [" + bookname + "] for bad characters") removedResults = removedResults + 1 rejected = True if not rejected and not bookname: logger.debug('Rejecting bookid %s for %s, no bookname' % (bookid, authorNameResult)) removedResults = removedResults + 1 rejected = True if not rejected: find_books = myDB.select('SELECT * FROM books WHERE BookName = "%s" and AuthorName = "%s"' % (bookname, authorNameResult.replace('"', '""'))) if find_books: for find_book in find_books: if find_book['BookID'] != bookid: # we have a book with this author/title already logger.debug('Rejecting bookid %s for [%s][%s] already got %s' % (find_book['BookID'], authorNameResult, bookname, bookid)) duplicates = duplicates + 1 rejected = True if not rejected: find_books = myDB.match('SELECT AuthorName,BookName FROM books WHERE BookID = "%s"' % bookid) if find_books: # we have a book with this bookid already if bookname != find_books['BookName'] or authorNameResult != find_books['AuthorName']: logger.debug('Rejecting bookid %s for [%s][%s] already got bookid for [%s][%s]' % (bookid, authorNameResult, bookname, find_books['AuthorName'], find_books['BookName'])) else: logger.debug('Rejecting bookid %s for [%s][%s] already got this book in database' % (bookid, authorNameResult, bookname)) duplicates = duplicates + 1 rejected = True if not rejected: if book_status != "Ignored": if not locked: controlValueDict = {"BookID": bookid} 
newValueDict = { "AuthorName": authorNameResult, "AuthorID": authorid, "AuthorLink": None, "BookName": bookname, "BookSub": booksub, "BookDesc": bookdesc, "BookIsbn": bookisbn, "BookPub": bookpub, "BookGenre": None, "BookImg": bookimg, "BookLink": booklink, "BookRate": bookrate, "BookPages": bookpages, "BookDate": pubyear, "BookLang": bookLanguage, "Status": book_status, "BookAdded": today(), "Series": series, "SeriesNum": seriesNum } resultsCount = resultsCount + 1 myDB.upsert("books", newValueDict, controlValueDict) logger.debug(u"Book found: " + book.find('title').text + " " + pubyear) if 'nocover' in bookimg or 'nophoto' in bookimg: # try to get a cover from librarything workcover = getBookCover(bookid) if workcover: logger.debug(u'Updated cover for %s to %s' % (bookname, workcover)) controlValueDict = {"BookID": bookid} newValueDict = {"BookImg": workcover} myDB.upsert("books", newValueDict, controlValueDict) elif bookimg and bookimg.startswith('http'): link = cache_cover(bookid, bookimg) if link: controlValueDict = {"BookID": bookid} newValueDict = {"BookImg": link} myDB.upsert("books", newValueDict, controlValueDict) if seriesNum is None: # try to get series info from librarything series, seriesNum = getWorkSeries(bookid) if seriesNum: logger.debug(u'Updated series: %s [%s]' % (series, seriesNum)) controlValueDict = {"BookID": bookid} newValueDict = { "Series": series, "SeriesNum": seriesNum } myDB.upsert("books", newValueDict, controlValueDict) worklink = getWorkPage(bookid) if worklink: controlValueDict = {"BookID": bookid} newValueDict = {"WorkPage": worklink} myDB.upsert("books", newValueDict, controlValueDict) if not find_book_status: logger.debug(u"[%s] Added book: %s" % (authorname, bookname)) added_count = added_count + 1 else: logger.debug(u"[%s] Updated book: %s" % (authorname, bookname)) updated_count = updated_count + 1 else: book_ignore_count = book_ignore_count + 1 loopCount = loopCount + 1 URL = 'http://www.goodreads.com/author/list/' + authorid + '.xml?' + \ urllib.urlencode(self.params) + '&page=' + str(loopCount) resultxml = None try: rootxml, in_cache = get_xml_request(URL, useCache=not refresh) if rootxml is None: logger.debug('Error requesting next page of results') else: resultxml = rootxml.getiterator('book') if not in_cache: api_hits = api_hits + 1 except Exception as e: resultxml = None logger.error("Error finding next page of results: %s" % str(e)) if resultxml: if all(False for book in resultxml): # returns True if iterator is empty resultxml = None lastbook = myDB.match('SELECT BookName, BookLink, BookDate from books WHERE AuthorID="%s" \ AND Status != "Ignored" order by BookDate DESC' % authorid) if lastbook: lastbookname = lastbook['BookName'] lastbooklink = lastbook['BookLink'] lastbookdate = lastbook['BookDate'] else: lastbookname = None lastbooklink = None lastbookdate = None controlValueDict = {"AuthorID": authorid} newValueDict = { "Status": "Active", "LastBook": lastbookname, "LastLink": lastbooklink, "LastDate": lastbookdate } myDB.upsert("authors", newValueDict, controlValueDict) # This is here because GoodReads sometimes has several entries with the same BookID! 
            modified_count = added_count + updated_count

            logger.debug("Found %s total book%s for author" % (total_count, plural(total_count)))
            logger.debug("Removed %s unwanted language result%s for author" % (ignored, plural(ignored)))
            logger.debug("Removed %s bad character or no-name result%s for author" %
                         (removedResults, plural(removedResults)))
            logger.debug("Removed %s duplicate result%s for author" % (duplicates, plural(duplicates)))
            logger.debug("Found %s book%s by author marked as Ignored" %
                         (book_ignore_count, plural(book_ignore_count)))
            logger.debug("Imported/Updated %s book%s for author" % (modified_count, plural(modified_count)))

            myDB.action('insert into stats values ("%s", %i, %i, %i, %i, %i, %i, %i, %i, %i)' %
                        (authorname.replace('"', '""'), api_hits, gr_lang_hits, lt_lang_hits,
                         gb_lang_change, cache_hits, ignored, removedResults, not_cached, duplicates))

            if refresh:
                logger.info("[%s] Book processing complete: Added %s book%s / Updated %s book%s" %
                            (authorname, added_count, plural(added_count), updated_count, plural(updated_count)))
            else:
                logger.info("[%s] Book processing complete: Added %s book%s to the database" %
                            (authorname, added_count, plural(added_count)))

        except Exception:
            logger.error('Unhandled exception in GR.get_author_books: %s' % traceback.format_exc())
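# --- Illustrative sketch (not part of LazyLibrarian) -------------------------
# Both copies of get_author_books() above walk the paged Goodreads author/list
# feed by incrementing &page= and stopping when a page returns no <book>
# elements.  The loop below is a stripped-down stand-alone version of that
# paging pattern using only the Python 2 standard library; it assumes a valid
# Goodreads developer key ('YOUR_KEY' is a placeholder) and the author id in
# the usage example is hypothetical.

import time
import urllib
import urllib2
import xml.etree.ElementTree as ElementTree


def author_books(author_id, api_key='YOUR_KEY'):
    """Yield (bookid, title) tuples from every page of the author/list feed."""
    page = 1
    while True:
        params = urllib.urlencode({'key': api_key, 'page': page})
        url = 'https://www.goodreads.com/author/list/%s.xml?%s' % (author_id, params)
        root = ElementTree.fromstring(urllib2.urlopen(url, timeout=30).read())
        books = root.findall('.//book')
        if not books:
            break              # an empty page means we have read every book
        for book in books:
            yield book.find('id').text, book.find('title').text
        page += 1
        time.sleep(1)          # mirror the one-request-per-second pause used above


if __name__ == '__main__':
    for bookid, title in author_books('12345'):
        print('%s  %s' % (bookid, title))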