def find_book_in_db(myDB, author, book):
    """Return the BookID of (author, book) from the books table, or 0 if not found.

    Tries an exact match on AuthorName/BookName first, then falls back to a
    fuzzy comparison (fuzz.ratio / fuzz.partial_ratio) against every book by
    that author.

    myDB:   open database connection providing action()/select()
    author: author name exactly as stored in the AuthorName column
    book:   book title to look up

    Returns the matching BookID, or 0 when no candidate scores high enough.
    """
    # prefer an exact match on author & book
    match = myDB.action("SELECT BookID FROM books where AuthorName=? and BookName=?",
                        [author, book]).fetchone()
    if match:
        logger.debug("Exact match [%s]" % book)
        return match["BookID"]

    # No exact match
    # Try a more complex fuzzy match against each book in the db by this author
    # Using hard-coded ratios for now, ratio high (>90), partial_ratio lower (>65)
    # These are results that work well on my library, minimal false matches and
    # no misses on books that should be matched
    # Maybe make ratios configurable in config.ini later
    #
    # Escape embedded double-quotes (SQL-style doubling) so author names
    # containing quotes don't break the statement
    books = myDB.select('SELECT BookID,BookName FROM books where AuthorName="%s"' %
                        author.replace('"', '""'))
    best_ratio = 0
    best_partial = 0
    ratio_name = ""
    partial_name = ""
    ratio_id = 0
    partial_id = 0
    logger.debug("Found %s books for %s" % (len(books), author))

    # lowercase everything to raise fuzziness scores;
    # loop-invariant, so hoisted out of the loop
    book_lower = book.lower()
    for a_book in books:
        a_book_lower = a_book["BookName"].lower()
        ratio = fuzz.ratio(book_lower, a_book_lower)
        partial = fuzz.partial_ratio(book_lower, a_book_lower)
        if ratio > best_ratio:
            best_ratio = ratio
            ratio_name = a_book["BookName"]
            ratio_id = a_book["BookID"]
        if partial > best_partial:
            best_partial = partial
            partial_name = a_book["BookName"]
            partial_id = a_book["BookID"]
        elif partial == best_partial:
            # prefer the match closest to the left, ie prefer starting with a match
            # and ignoring the rest - this eliminates most false matches against omnibuses
            if a_book_lower.find(book_lower) < partial_name.lower().find(book_lower):
                logger.debug("Fuzz left prefer [%s] over [%s]" % (a_book["BookName"], partial_name))
                best_partial = partial
                partial_name = a_book["BookName"]
                partial_id = a_book["BookID"]

    if best_ratio > 90:
        logger.debug("Fuzz match ratio [%d] [%s] [%s]" % (best_ratio, book, ratio_name))
        return ratio_id
    if best_partial > 65:
        logger.debug("Fuzz match partial [%d] [%s] [%s]" % (best_partial, book, partial_name))
        return partial_id

    logger.debug(
        "Fuzz failed [%s - %s] ratio [%d,%s], partial [%d,%s]"
        % (author, book, best_ratio, ratio_name, best_partial, partial_name)
    )
    return 0
def find_results(self, searchterm=None, queue=None):
    """ GoogleBooks performs much better if we search for author OR title
        not both at once, so if searchterm is not isbn, two searches needed.
        Lazylibrarian searches use <ll> to separate title from author in searchterm
        If this token isn't present, it's an isbn or searchterm as supplied by user

        Results are appended as dicts to resultlist and delivered via queue.put();
        nothing is returned. All errors are caught and logged.
    """
    try:
        myDB = database.DBConnection()
        resultlist = []
        # See if we should check ISBN field, otherwise ignore it
        api_strings = ['inauthor:', 'intitle:']
        if is_valid_isbn(searchterm):
            api_strings = ['isbn:']

        api_hits = 0
        ignored = 0
        total_count = 0
        no_author_count = 0
        if ' <ll> ' in searchterm:  # special token separates title from author
            title, authorname = searchterm.split(' <ll> ')
        else:
            title = ''
            authorname = ''

        fullterm = searchterm.replace(' <ll> ', '')
        logger.debug('Now searching Google Books API with searchterm: %s' % fullterm)
        for api_value in api_strings:
            set_url = self.url
            if api_value == "isbn:":
                set_url = set_url + urllib.quote(
                    api_value + searchterm.encode(lazylibrarian.SYS_ENCODING))
            elif api_value == 'intitle:':
                searchterm = fullterm
                if title:  # just search for title
                    title = title.split(' (')[0]  # without any series info
                    searchterm = title
                searchterm = searchterm.replace("'", "").replace('"', '')  # and no quotes
                searchterm = searchterm.strip()
                set_url = set_url + \
                    urllib.quote(api_value + '"' + searchterm.encode(lazylibrarian.SYS_ENCODING) + '"')
            elif api_value == 'inauthor:':
                searchterm = fullterm
                if authorname:
                    searchterm = authorname  # just search for author
                set_url = set_url + \
                    urllib.quote(api_value + '"' + searchterm.encode(lazylibrarian.SYS_ENCODING) + '"')
                # NOTE(review): strip() happens AFTER the URL is built here,
                # unlike the intitle: branch above — looks unintentional, preserved as-is
                searchterm = searchterm.strip()

            # per-api_value counters: note these reset the function-wide totals
            # initialised above, so the final log lines only reflect the last api_value
            startindex = 0
            resultcount = 0
            ignored = 0
            number_results = 1
            total_count = 0
            no_author_count = 0
            try:
                # Google Books pages results 40 at a time
                while startindex < number_results:
                    self.params['startIndex'] = startindex
                    URL = set_url + '&' + urllib.urlencode(self.params)
                    try:
                        jsonresults, in_cache = get_json_request(URL)
                        if not jsonresults:
                            number_results = 0
                        else:
                            if not in_cache:
                                api_hits += 1
                            number_results = jsonresults['totalItems']
                            logger.debug('Searching url: ' + URL)
                        if number_results == 0:
                            logger.warn(
                                'Found no results for %s with value: %s' %
                                (api_value, searchterm))
                            break
                        else:
                            pass
                    except HTTPError as err:
                        logger.warn(
                            'Google Books API Error [%s]: Check your API key or wait a while' %
                            err.reason)
                        break

                    startindex += 40

                    for item in jsonresults['items']:
                        total_count += 1
                        # skip if no author, no author is no book.
                        try:
                            Author = item['volumeInfo']['authors'][0]
                        except KeyError:
                            logger.debug('Skipped a result without authorfield.')
                            no_author_count += 1
                            continue
                        try:
                            bookname = item['volumeInfo']['title']
                        except KeyError:
                            logger.debug('Skipped a result without title.')
                            continue

                        valid_langs = getList(lazylibrarian.CONFIG['IMP_PREFLANG'])
                        booklang = ''
                        # unless "All" is configured, filter out unwanted languages
                        if "All" not in valid_langs:
                            try:
                                # skip if language is not in valid list -
                                booklang = item['volumeInfo']['language']
                                if booklang not in valid_langs:
                                    logger.debug(
                                        'Skipped %s with language %s' %
                                        (bookname, booklang))
                                    ignored += 1
                                    continue
                            except KeyError:
                                ignored += 1
                                logger.debug(
                                    'Skipped %s where no language is found' % bookname)
                                continue

                        # optional volumeInfo fields, each with a fallback default
                        try:
                            bookpub = item['volumeInfo']['publisher']
                        except KeyError:
                            bookpub = ""
                        try:
                            booksub = item['volumeInfo']['subtitle']
                        except KeyError:
                            booksub = ""
                        try:
                            bookdate = item['volumeInfo']['publishedDate']
                        except KeyError:
                            bookdate = '0000-00-00'
                        bookdate = bookdate[:4]  # keep year only
                        try:
                            bookimg = item['volumeInfo']['imageLinks']['thumbnail']
                        except KeyError:
                            bookimg = 'images/nocover.png'
                        try:
                            bookrate = item['volumeInfo']['averageRating']
                        except KeyError:
                            bookrate = 0
                        try:
                            bookpages = item['volumeInfo']['pageCount']
                        except KeyError:
                            bookpages = '0'
                        try:
                            bookgenre = item['volumeInfo']['categories'][0]
                        except KeyError:
                            bookgenre = ""
                        try:
                            bookdesc = item['volumeInfo']['description']
                        except KeyError:
                            bookdesc = 'Not available'
                        try:
                            num_reviews = item['volumeInfo']['ratingsCount']
                        except KeyError:
                            num_reviews = 0
                        # NOTE(review): only inspects the first industryIdentifier,
                        # and only accepts it if it is an ISBN_10
                        try:
                            if item['volumeInfo']['industryIdentifiers'][0]['type'] == 'ISBN_10':
                                bookisbn = item['volumeInfo']['industryIdentifiers'][0]['identifier']
                            else:
                                bookisbn = 0
                        except KeyError:
                            bookisbn = 0

                        # score how closely this result matches what was asked for
                        if authorname:
                            author_fuzz = fuzz.ratio(Author, authorname)
                        else:
                            author_fuzz = fuzz.ratio(Author, fullterm)
                        if title:
                            book_fuzz = fuzz.ratio(bookname, title)
                            # lose a point for each extra word in the fuzzy matches so we get the closest match
                            words = len(getList(bookname))
                            words -= len(getList(title))
                            book_fuzz -= abs(words)
                        else:
                            book_fuzz = fuzz.ratio(bookname, fullterm)
                        isbn_fuzz = 0
                        if is_valid_isbn(fullterm):
                            isbn_fuzz = 100

                        highest_fuzz = max((author_fuzz + book_fuzz) / 2, isbn_fuzz)

                        # tidy the title for storage/display
                        dic = {':': '.', '"': '', '\'': ''}
                        bookname = replace_all(bookname, dic)
                        bookname = unaccented(bookname)
                        bookname = bookname.strip()  # strip whitespace

                        bookid = item['id']
                        # look up AuthorID if this author is already in our database
                        # (double-quotes doubled for SQL escaping)
                        author = myDB.select(
                            'SELECT AuthorID FROM authors WHERE AuthorName = "%s"' %
                            Author.replace('"', '""'))
                        if author:
                            AuthorID = author[0]['authorid']
                        else:
                            AuthorID = ''

                        resultlist.append({
                            'authorname': Author,
                            'authorid': AuthorID,
                            'bookid': bookid,
                            'bookname': bookname,
                            'booksub': booksub,
                            'bookisbn': bookisbn,
                            'bookpub': bookpub,
                            'bookdate': bookdate,
                            'booklang': booklang,
                            'booklink': item['volumeInfo']['canonicalVolumeLink'],
                            'bookrate': float(bookrate),
                            'bookimg': bookimg,
                            'bookpages': bookpages,
                            'bookgenre': bookgenre,
                            'bookdesc': bookdesc,
                            'author_fuzz': author_fuzz,
                            'book_fuzz': book_fuzz,
                            'isbn_fuzz': isbn_fuzz,
                            'highest_fuzz': highest_fuzz,
                            'num_reviews': num_reviews
                        })

                        resultcount += 1

            except KeyError:
                # e.g. a results page with no 'items' key: stop querying this api_value
                break

            logger.debug(
                "Returning %s result%s for (%s) with keyword: %s" %
                (resultcount, plural(resultcount), api_value, searchterm))

        logger.debug("Found %s result%s" % (total_count, plural(total_count)))
        logger.debug("Removed %s unwanted language result%s" % (ignored, plural(ignored)))
        logger.debug("Removed %s book%s with no author" % (no_author_count, plural(no_author_count)))
        logger.debug(
            'The Google Books API was hit %s time%s for searchterm: %s' %
            (api_hits, plural(api_hits), fullterm))

        queue.put(resultlist)

    except Exception:
        logger.error('Unhandled exception in GB.find_results: %s' % traceback.format_exc())
def find_results(self, searchterm=None, queue=None):
    """ GoogleBooks performs much better if we search for author OR title
        not both at once, so if searchterm is not isbn, two searches needed.
        Lazylibrarian searches use <ll> to separate title from author in searchterm
        If this token isn't present, it's an isbn or searchterm as supplied by user

        Result dicts are collected in resultlist and delivered via queue.put();
        nothing is returned. All errors are caught and logged.
    """
    try:
        myDB = database.DBConnection()
        resultlist = []
        # See if we should check ISBN field, otherwise ignore it
        api_strings = ['inauthor:', 'intitle:']
        if is_valid_isbn(searchterm):
            api_strings = ['isbn:']

        api_hits = 0
        ignored = 0
        total_count = 0
        no_author_count = 0
        title = ''
        authorname = ''
        if ' <ll> ' in searchterm:  # special token separates title from author
            title, authorname = searchterm.split(' <ll> ')

        fullterm = searchterm.replace(' <ll> ', ' ')
        logger.debug('Now searching Google Books API with searchterm: %s' % fullterm)

        for api_value in api_strings:
            set_url = self.url
            if api_value == "isbn:":
                set_url = set_url + quote(api_value + searchterm)
            elif api_value == 'intitle:':
                searchterm = fullterm
                if title:  # just search for title
                    # noinspection PyUnresolvedReferences
                    title = title.split(' (')[0]  # without any series info
                    searchterm = title
                searchterm = searchterm.replace("'", "").replace('"', '').strip()  # and no quotes
                if PY2:
                    searchterm = searchterm.encode(lazylibrarian.SYS_ENCODING)
                set_url = set_url + quote(api_value + '"' + searchterm + '"')
            elif api_value == 'inauthor:':
                searchterm = fullterm
                if authorname:
                    searchterm = authorname  # just search for author
                searchterm = searchterm.strip()
                if PY2:
                    searchterm = searchterm.encode(lazylibrarian.SYS_ENCODING)
                # NOTE(review): this branch uses quote_plus while intitle: uses quote
                set_url = set_url + quote_plus(api_value + '"' + searchterm + '"')

            # per-api_value counters: note these reset the function-wide totals
            # initialised above, so the final log lines only reflect the last api_value
            startindex = 0
            resultcount = 0
            ignored = 0
            number_results = 1
            total_count = 0
            no_author_count = 0

            try:
                # Google Books pages results 40 at a time
                while startindex < number_results:
                    self.params['startIndex'] = startindex
                    URL = set_url + '&' + urlencode(self.params)
                    try:
                        jsonresults, in_cache = gb_json_request(URL)
                        if jsonresults is None:
                            number_results = 0
                        else:
                            if not in_cache:
                                api_hits += 1
                            number_results = jsonresults['totalItems']
                            logger.debug('Searching url: ' + URL)
                        if number_results == 0:
                            logger.warn(
                                'Found no results for %s with value: %s' %
                                (api_value, searchterm))
                            break
                        else:
                            pass
                    except Exception as err:
                        # HTTP errors carry a 'reason'; anything else is stringified
                        if hasattr(err, 'reason'):
                            errmsg = err.reason
                        else:
                            errmsg = str(err)
                        logger.warn(
                            'Google Books API Error [%s]: Check your API key or wait a while' %
                            errmsg)
                        break

                    startindex += 40

                    for item in jsonresults['items']:
                        total_count += 1
                        book = bookdict(item)
                        # skip if no author, no author is no book
                        if not book['author']:
                            logger.debug('Skipped a result without authorfield.')
                            no_author_count += 1
                            continue
                        if not book['name']:
                            logger.debug('Skipped a result without title.')
                            continue

                        valid_langs = getList(lazylibrarian.CONFIG['IMP_PREFLANG'])
                        # unless "All" is configured, filter out unwanted languages
                        if "All" not in valid_langs:
                            try:
                                # skip if language is not in valid list -
                                booklang = book['lang']
                                if booklang not in valid_langs:
                                    logger.debug(
                                        'Skipped %s with language %s' %
                                        (book['name'], booklang))
                                    ignored += 1
                                    continue
                            except KeyError:
                                ignored += 1
                                logger.debug(
                                    'Skipped %s where no language is found' % book['name'])
                                continue

                        # score how closely this result matches what was asked for
                        if authorname:
                            author_fuzz = fuzz.ratio(book['author'], authorname)
                        else:
                            author_fuzz = fuzz.ratio(book['author'], fullterm)
                        if title:
                            book_fuzz = fuzz.token_set_ratio(book['name'], title)
                            # lose a point for each extra word in the fuzzy matches so we get the closest match
                            words = len(getList(book['name']))
                            words -= len(getList(title))
                            book_fuzz -= abs(words)
                        else:
                            book_fuzz = fuzz.token_set_ratio(book['name'], fullterm)
                        isbn_fuzz = 0
                        if is_valid_isbn(fullterm):
                            isbn_fuzz = 100

                        highest_fuzz = max((author_fuzz + book_fuzz) / 2, isbn_fuzz)

                        # tidy the title for storage/display
                        dic = {':': '.', '"': '', '\'': ''}
                        bookname = replace_all(book['name'], dic)
                        bookname = unaccented(bookname)
                        bookname = bookname.strip()  # strip whitespace

                        # look up AuthorID if this author is already in our database
                        # NOTE(review): parameterised query, yet the quotes are still
                        # doubled as if building SQL text — looks redundant, preserved as-is
                        AuthorID = ''
                        if book['author']:
                            match = myDB.match(
                                'SELECT AuthorID FROM authors WHERE AuthorName=?',
                                (book['author'].replace('"', '""'),))
                            if match:
                                AuthorID = match['AuthorID']

                        resultlist.append({
                            'authorname': book['author'],
                            'authorid': AuthorID,
                            'bookid': item['id'],
                            'bookname': bookname,
                            'booksub': book['sub'],
                            'bookisbn': book['isbn'],
                            'bookpub': book['pub'],
                            'bookdate': book['date'],
                            'booklang': book['lang'],
                            'booklink': book['link'],
                            'bookrate': float(book['rate']),
                            'bookrate_count': book['rate_count'],
                            'bookimg': book['img'],
                            'bookpages': book['pages'],
                            'bookgenre': book['genre'],
                            'bookdesc': book['desc'],
                            'author_fuzz': author_fuzz,
                            'book_fuzz': book_fuzz,
                            'isbn_fuzz': isbn_fuzz,
                            'highest_fuzz': highest_fuzz,
                            'num_reviews': book['ratings']
                        })

                        resultcount += 1

            except KeyError:
                # e.g. a results page with no 'items' key: stop querying this api_value
                break

            logger.debug(
                "Returning %s result%s for (%s) with keyword: %s" %
                (resultcount, plural(resultcount), api_value, searchterm))

        logger.debug("Found %s result%s" % (total_count, plural(total_count)))
        logger.debug("Removed %s unwanted language result%s" % (ignored, plural(ignored)))
        logger.debug("Removed %s book%s with no author" % (no_author_count, plural(no_author_count)))
        logger.debug(
            'The Google Books API was hit %s time%s for searchterm: %s' %
            (api_hits, plural(api_hits), fullterm))

        queue.put(resultlist)

    except Exception:
        logger.error('Unhandled exception in GB.find_results: %s' % traceback.format_exc())
def find_results(self, authorname=None, queue=None):
    # Search the Google Books API for self.name (by isbn, or by author and
    # title separately) and deliver the list of result dicts via queue.put().
    # NOTE(review): Python 2 only — uses `except HTTPError, err:` syntax and urllib2.
    threading.currentThread().name = "GB-SEARCH"
    resultlist = []
    # See if we should check ISBN field, otherwise ignore it
    # (a 9 or 12 digit prefix suggests a 10/13 digit ISBN with check digit stripped)
    try:
        isbn_check = int(authorname[:-1])
        if (len(str(isbn_check)) == 9) or (len(str(isbn_check)) == 12):
            api_strings = ['isbn:']
        else:
            api_strings = ['inauthor:', 'intitle:']
    except:
        api_strings = ['inauthor:', 'intitle:']
    api_hits = 0
    logger.info('Now searching Google Books API with keyword: ' + self.name)

    for api_value in api_strings:
        startindex = 0
        if api_value == "isbn:":
            set_url = self.url + urllib.quote(api_value + self.name)
        else:
            set_url = self.url + urllib.quote(api_value + '"' + self.name + '"')

        try:
            # per-api_value counters
            startindex = 0
            resultcount = 0
            removedResults = 0  # NOTE(review): never incremented or read afterwards
            ignored = 0
            total_count = 0
            no_author_count = 0

            # Google Books pages results 40 at a time; exit is via the
            # startindex >= number_results check at the bottom of the loop
            while True:
                self.params['startIndex'] = startindex
                URL = set_url + '&' + urllib.urlencode(self.params)

                try:
                    jsonresults = json.JSONDecoder().decode(urllib2.urlopen(URL, timeout=30).read())
                    api_hits = api_hits + 1
                    number_results = jsonresults['totalItems']
                    logger.debug('Searching url: ' + URL)
                    if number_results == 0:
                        logger.info('Found no results for %s with value: %s' % (api_value, self.name))
                        break
                    else:
                        pass
                except HTTPError, err:
                    logger.warn('Google Books API Error [%s]: Check your API key or wait a while' % err.msg)
                    break

                startindex = startindex+40

                for item in jsonresults['items']:
                    total_count = total_count + 1

                    # skip if no author, no author is no book.
                    try:
                        Author = item['volumeInfo']['authors'][0]
                    except KeyError:
                        logger.debug('Skipped a result without authorfield.')
                        no_author_count = no_author_count + 1
                        continue

                    try:
                        # skip if language is in ignore list
                        booklang = item['volumeInfo']['language']
                        valid_langs = ([valid_lang.strip()
                                       for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')])
                        if booklang not in valid_langs:
                            logger.debug('Skipped a book with language %s' % booklang)
                            ignored = ignored + 1
                            continue
                    except KeyError:
                        ignored = ignored+1
                        logger.debug('Skipped a result where no language is found')
                        continue

                    # optional volumeInfo fields, each with a fallback default
                    try:
                        bookpub = item['volumeInfo']['publisher']
                    except KeyError:
                        bookpub = None
                    try:
                        booksub = item['volumeInfo']['subtitle']
                    except KeyError:
                        booksub = None
                    try:
                        bookdate = item['volumeInfo']['publishedDate']
                    except KeyError:
                        bookdate = '0000-00-00'
                    bookdate = bookdate[:4]  # keep year only
                    try:
                        bookimg = item['volumeInfo']['imageLinks']['thumbnail']
                    except KeyError:
                        bookimg = 'images/nocover.png'
                    try:
                        bookrate = item['volumeInfo']['averageRating']
                    except KeyError:
                        bookrate = 0
                    try:
                        bookpages = item['volumeInfo']['pageCount']
                    except KeyError:
                        bookpages = '0'
                    try:
                        bookgenre = item['volumeInfo']['categories'][0]
                    except KeyError:
                        bookgenre = None
                    try:
                        bookdesc = item['volumeInfo']['description']
                    except KeyError:
                        bookdesc = 'Not available'
                    try:
                        num_reviews = item['volumeInfo']['ratingsCount']
                    except KeyError:
                        num_reviews = 0
                    # only the first industryIdentifier is inspected, ISBN_10 only
                    try:
                        if item['volumeInfo']['industryIdentifiers'][0]['type'] == 'ISBN_10':
                            bookisbn = item['volumeInfo']['industryIdentifiers'][0]['identifier']
                        else:
                            bookisbn = 0
                    except KeyError:
                        bookisbn = 0

                    # score this result against the supplied keyword
                    author_fuzz = fuzz.ratio(Author.lower(), authorname.lower())
                    book_fuzz = fuzz.ratio(item['volumeInfo']['title'].lower(), authorname.lower())
                    try:
                        isbn_check = int(authorname[:-1])
                        if (len(str(isbn_check)) == 9) or (len(str(isbn_check)) == 12):
                            isbn_fuzz = int(100)
                        else:
                            isbn_fuzz = int(0)
                    except:
                        isbn_fuzz = int(0)
                    highest_fuzz = max(author_fuzz, book_fuzz, isbn_fuzz)

                    # Darkie67:
                    # replacing German Umlauts and filtering out ":"
                    #
                    booknamealt = item['volumeInfo']['title']
                    booknametmp1=booknamealt.replace(u'\xf6',u'oe')
                    booknametmp2=booknametmp1.replace(u'\xe4',u'ae')
                    booknametmp3=booknametmp2.replace(u'\xdf',u'ss')
                    booknametmp4=booknametmp3.replace(u'\xc4',u'Ae')
                    booknametmp5=booknametmp4.replace(u'\xdc',u'Ue')
                    booknametmp6=booknametmp5.replace(u'\xd6',u'Oe')
                    booknametmp7=booknametmp6.replace(':','')
                    bookname=booknametmp7.replace(u'\xfc',u'ue')
                    # Darkie67 end

                    resultlist.append({
                        'authorname': Author,
                        'bookid': item['id'],
                        'bookname': bookname,
                        'booksub': booksub,
                        'bookisbn': bookisbn,
                        'bookpub': bookpub,
                        'bookdate': bookdate,
                        'booklang': booklang,
                        'booklink': item['volumeInfo']['canonicalVolumeLink'],
                        'bookrate': float(bookrate),
                        'bookimg': bookimg,
                        'bookpages': bookpages,
                        'bookgenre': bookgenre,
                        'bookdesc': bookdesc,
                        'author_fuzz': author_fuzz,
                        'book_fuzz': book_fuzz,
                        'isbn_fuzz': isbn_fuzz,
                        'highest_fuzz': highest_fuzz,
                        'num_reviews': num_reviews
                    })

                    resultcount = resultcount+1

                # all pages consumed for this api_value: log totals and move on
                if startindex >= number_results:
                    logger.debug("Found %s total results" % total_count)
                    logger.debug("Removed %s bad language results" % ignored)
                    logger.debug("Removed %s books with no author" % no_author_count)
                    logger.info("Showing %s results for (%s) with keyword: %s" %
                                (resultcount, api_value, authorname))
                    break
                else:
                    continue

        except KeyError:
            # e.g. a results page with no 'items' key: stop querying entirely
            break

    logger.info('The Google Books API was hit %s times for keyword %s' % (str(api_hits), self.name))
    queue.put(resultlist)
def LibraryScan(dir=None):
    # Walk an ebook directory tree, identify books from embedded metadata,
    # .opf files or filename pattern matching, mark found books "Open" in the
    # database, and finally refresh per-author book counts and scan statistics.
    # dir defaults to lazylibrarian.DOWNLOAD_DIR when not supplied.
    if not dir:
        if not lazylibrarian.DOWNLOAD_DIR:
            return
        else:
            dir = lazylibrarian.DOWNLOAD_DIR
    if not os.path.isdir(dir):
        logger.warn("Cannot find directory: %s. Not scanning" %
                    dir.decode(lazylibrarian.SYS_ENCODING, "replace"))
        return

    myDB = database.DBConnection()

    # stats table is rebuilt for every scan; other code populates it during the run
    myDB.action("drop table if exists stats")
    myDB.action(
        "create table stats (authorname text, GR_book_hits int, GR_lang_hits int, LT_lang_hits int, \
            GB_lang_change, cache_hits int, bad_lang int, bad_char int, uncached int )"
    )

    logger.info("Scanning ebook directory: %s" % dir.decode(lazylibrarian.SYS_ENCODING, "replace"))

    new_book_count = 0
    file_count = 0

    if lazylibrarian.FULL_SCAN:
        # full scan: first mark any "Open" book whose file has vanished as missing
        books = myDB.select('select AuthorName, BookName, BookFile, BookID from books where Status="Open"')
        status = lazylibrarian.NOTFOUND_STATUS
        logger.info("Missing books will be marked as %s" % status)
        for book in books:
            bookName = book["BookName"]
            bookAuthor = book["AuthorName"]
            bookID = book["BookID"]
            bookfile = book["BookFile"]
            if not (bookfile and os.path.isfile(bookfile)):
                myDB.action('update books set Status="%s" where BookID="%s"' % (status, bookID))
                myDB.action('update books set BookFile="" where BookID="%s"' % bookID)
                logger.warn("Book %s - %s updated as not found on disk" % (bookAuthor, bookName))

    # guess this was meant to save repeat-scans of the same directory
    # if it contains multiple formats of the same book, but there was no code
    # that looked at the array. renamed from latest to processed to make
    # purpose clearer
    processed_subdirectories = []

    # massage the EBOOK_DEST_FILE config parameter into something we can use
    # with regular expression matching: backslash-escape every character, then
    # turn the $Author / $Title placeholders into named capture groups
    matchString = ""
    for char in lazylibrarian.EBOOK_DEST_FILE:
        matchString = matchString + "\\" + char
    booktypes = ""
    count = -1
    booktype_list = formatter.getList(lazylibrarian.EBOOK_TYPE)
    for book_type in booktype_list:
        count += 1
        if count == 0:
            booktypes = book_type
        else:
            booktypes = booktypes + "|" + book_type
    matchString = (
        matchString.replace("\\$\\A\\u\\t\\h\\o\\r", "(?P<author>.*?)").replace("\\$\\T\\i\\t\\l\\e",
                                                                                "(?P<book>.*?)")
        + "\.[" + booktypes + "]"
    )
    pattern = re.compile(matchString, re.VERBOSE)

    for r, d, f in os.walk(dir):
        # prune hidden directories, and "_" prefixed ones,
        # to prevent magazine folders being scanned
        for directory in d[:]:
            if directory.startswith("."):
                d.remove(directory)
            if directory.startswith("_"):
                d.remove(directory)
        for files in f:
            file_count += 1
            subdirectory = r.replace(dir, "")
            # Added new code to skip if we've done this directory before.
            # Made this conditional with a switch in config.ini
            # in case user keeps multiple different books in the same subdirectory
            if (lazylibrarian.IMP_SINGLEBOOK) and (subdirectory in processed_subdirectories):
                logger.debug("[%s] already scanned" % subdirectory)
            else:
                # If this is a book, try to get author/title/isbn/language
                # if epub or mobi, read metadata from the book
                # If metadata.opf exists, use that allowing it to override
                # embedded metadata. User may have edited metadata.opf
                # to merge author aliases together
                # If all else fails, try pattern match for author/title
                # and look up isbn/lang from LT or GR later
                match = 0
                if formatter.is_valid_booktype(files):
                    logger.debug(
                        "[%s] Now scanning subdirectory %s" % (
                            dir.decode(lazylibrarian.SYS_ENCODING, "replace"),
                            subdirectory.decode(lazylibrarian.SYS_ENCODING, "replace"),
                        )
                    )
                    language = "Unknown"
                    isbn = ""
                    book = ""
                    author = ""
                    words = files.split(".")
                    extn = words[len(words) - 1]

                    # if it's an epub or a mobi we can try to read metadata from it
                    if (extn == "epub") or (extn == "mobi"):
                        book_filename = os.path.join(r, files).encode(lazylibrarian.SYS_ENCODING)
                        try:
                            res = get_book_info(book_filename)
                        except:
                            res = {}
                        if "title" in res and "creator" in res:  # this is the minimum we need
                            match = 1
                            book = res["title"]
                            author = res["creator"]
                            if "language" in res:
                                language = res["language"]
                            if "identifier" in res:
                                isbn = res["identifier"]
                            if "type" in res:
                                extn = res["type"]
                            logger.debug("book meta [%s] [%s] [%s] [%s] [%s]" %
                                         (isbn, language, author, book, extn))
                        else:
                            logger.debug("Book meta incomplete in %s" % book_filename)

                    # calibre uses "metadata.opf", LL uses "bookname - authorname.opf"
                    # just look for any .opf file in the current directory since we don't know
                    # LL preferred authorname/bookname at this point.
                    # Allow metadata in file to override book contents as may be users pref
                    metafile = opf_file(r)
                    try:
                        res = get_book_info(metafile)
                    except:
                        res = {}
                    if "title" in res and "creator" in res:  # this is the minimum we need
                        match = 1
                        book = res["title"]
                        author = res["creator"]
                        if "language" in res:
                            language = res["language"]
                        if "identifier" in res:
                            isbn = res["identifier"]
                        logger.debug("file meta [%s] [%s] [%s] [%s]" % (isbn, language, author, book))
                    else:
                        logger.debug("File meta incomplete in %s" % metafile)

                    if not match:
                        # no author/book from metadata file, and not embedded either
                        # NOTE(review): match becomes a regex match object here,
                        # elsewhere it is an int flag — both are truthy when matched
                        match = pattern.match(files)
                        if match:
                            author = match.group("author")
                            book = match.group("book")
                        else:
                            logger.debug("Pattern match failed [%s]" % files)

                    if match:
                        # flag that we found a book in this subdirectory
                        processed_subdirectories.append(subdirectory)

                        # If we have a valid looking isbn, and language != "Unknown", add it to cache
                        if language != "Unknown" and formatter.is_valid_isbn(isbn):
                            logger.debug("Found Language [%s] ISBN [%s]" % (language, isbn))
                            # we need to add it to language cache if not already
                            # there, is_valid_isbn has checked length is 10 or 13
                            if len(isbn) == 10:
                                isbnhead = isbn[0:3]
                            else:
                                isbnhead = isbn[3:6]
                            match = myDB.action('SELECT lang FROM languages where isbn = "%s"' %
                                                (isbnhead)).fetchone()
                            if not match:
                                myDB.action('insert into languages values ("%s", "%s")' %
                                            (isbnhead, language))
                                logger.debug("Cached Lang [%s] ISBN [%s]" % (language, isbnhead))
                            else:
                                logger.debug("Already cached Lang [%s] ISBN [%s]" % (language, isbnhead))

                        # get authors name in a consistent format
                        if "," in author:  # "surname, forename"
                            words = author.split(",")
                            author = words[1].strip() + " " + words[0].strip()  # "forename surname"
                        # single-letter first name: normalise "J R R Tolkien" style to "J.R.R.Tolkien"
                        if author[1] == " ":
                            author = author.replace(" ", ".")
                            author = author.replace("..", ".")

                        # Check if the author exists, and import the author if not,
                        # before starting any complicated book-name matching to save repeating the search
                        #
                        check_exist_author = myDB.action(
                            'SELECT * FROM authors where AuthorName="%s"' % author
                        ).fetchone()
                        if not check_exist_author and lazylibrarian.ADD_AUTHOR:
                            # no match for supplied author, but we're allowed to
                            # add new ones
                            GR = GoodReads(author)
                            try:
                                author_gr = GR.find_author_id()
                            except:
                                logger.warn("Error finding author id for [%s]" % author)
                                continue

                            # only try to add if GR data matches found author data
                            if author_gr:
                                authorname = author_gr["authorname"]
                                # "J.R.R. Tolkien" is the same person as "J. R. R. Tolkien" and "J R R Tolkien"
                                match_auth = author.replace(".", "_")
                                match_auth = match_auth.replace(" ", "_")
                                match_auth = match_auth.replace("__", "_")
                                match_name = authorname.replace(".", "_")
                                match_name = match_name.replace(" ", "_")
                                match_name = match_name.replace("__", "_")
                                match_name = common.remove_accents(match_name)
                                match_auth = common.remove_accents(match_auth)
                                # allow a degree of fuzziness to cater for different accented character handling.
                                # some author names have accents,
                                # filename may have the accented or un-accented version of the character
                                # The currently non-configurable value of fuzziness might need to go in config
                                # We stored GoodReads unmodified author name in
                                # author_gr, so store in LL db under that
                                # fuzz.ratio doesn't lowercase for us
                                match_fuzz = fuzz.ratio(match_auth.lower(), match_name.lower())
                                if match_fuzz < 90:
                                    logger.debug("Failed to match author [%s] fuzz [%d]" % (author, match_fuzz))
                                    logger.debug(
                                        "Failed to match author [%s] to authorname [%s]" % (match_auth, match_name)
                                    )

                                # To save loading hundreds of books by unknown
                                # authors at GR or GB, ignore if author "Unknown"
                                if (author != "Unknown") and (match_fuzz >= 90):
                                    # use "intact" name for author that we stored in
                                    # GR author_dict, not one of the various mangled versions
                                    # otherwise the books appear to be by a different author!
                                    author = author_gr["authorname"]
                                    # this new authorname may already be in the
                                    # database, so check again
                                    check_exist_author = myDB.action(
                                        'SELECT * FROM authors where AuthorName="%s"' % author
                                    ).fetchone()
                                    if not check_exist_author:
                                        logger.debug("Adding new author [%s]" % author)
                                        try:
                                            importer.addAuthorToDB(author)
                                            check_exist_author = myDB.action(
                                                'SELECT * FROM authors where AuthorName="%s"' % author
                                            ).fetchone()
                                        except:
                                            continue

                        # check author exists in db, either newly loaded or already there
                        if not check_exist_author:
                            logger.debug("Failed to match author [%s] in database" % author)
                        else:
                            # author exists, check if this book by this author is in our database
                            # metadata might have quotes in book name
                            book = book.replace('"', "").replace("'", "")
                            bookid = find_book_in_db(myDB, author, book)
                            if bookid:
                                # check if book is already marked as "Open" (if so,
                                # we already had it)
                                check_status = myDB.action(
                                    'SELECT Status from books where BookID="%s"' % bookid
                                ).fetchone()
                                if check_status["Status"] != "Open":
                                    # update status as we've got this book
                                    myDB.action('UPDATE books set Status="Open" where BookID="%s"' % bookid)
                                    book_filename = os.path.join(r, files).encode(lazylibrarian.SYS_ENCODING)
                                    # update book location so we can check if it
                                    # gets removed, or allow click-to-open
                                    myDB.action(
                                        'UPDATE books set BookFile="%s" where BookID="%s"' % (book_filename, bookid)
                                    )
                                    new_book_count += 1

    cachesize = myDB.action("select count('ISBN') as counter from languages").fetchone()
    logger.info("%s new/modified books found and added to the database" % new_book_count)
    logger.info("%s files processed" % file_count)
    stats = myDB.action(
        "SELECT sum(GR_book_hits), sum(GR_lang_hits), sum(LT_lang_hits), sum(GB_lang_change), \
            sum(cache_hits), sum(bad_lang), sum(bad_char), sum(uncached) FROM stats"
    ).fetchone()
    if stats["sum(GR_book_hits)"] is not None:
        # only show stats if new books added
        if lazylibrarian.BOOK_API == "GoogleBooks":
            logger.debug("GoogleBooks was hit %s times for books" % stats["sum(GR_book_hits)"])
            logger.debug("GoogleBooks language was changed %s times" % stats["sum(GB_lang_change)"])
        if lazylibrarian.BOOK_API == "GoodReads":
            logger.debug("GoodReads was hit %s times for books" % stats["sum(GR_book_hits)"])
            logger.debug("GoodReads was hit %s times for languages" % stats["sum(GR_lang_hits)"])
        logger.debug("LibraryThing was hit %s times for languages" % stats["sum(LT_lang_hits)"])
        logger.debug("Language cache was hit %s times" % stats["sum(cache_hits)"])
        logger.debug("Unwanted language removed %s books" % stats["sum(bad_lang)"])
        logger.debug("Unwanted characters removed %s books" % stats["sum(bad_char)"])
        logger.debug("Unable to cache %s books with missing ISBN" % stats["sum(uncached)"])
    logger.debug("Cache %s hits, %s miss" % (lazylibrarian.CACHE_HIT, lazylibrarian.CACHE_MISS))
    logger.debug("ISBN Language cache holds %s entries" % cachesize["counter"])
    stats = len(myDB.select('select BookID from Books where status="Open" and BookLang="Unknown"'))
    if stats:
        logger.warn("There are %s books in your library with unknown language" % stats)

    authors = myDB.select("select AuthorName from authors")
    # Update bookcounts for all authors, not just new ones - refresh may have located
    # new books for existing authors especially if switched provider gb/gr
    logger.debug("Updating bookcounts for %i authors" % len(authors))
    for author in authors:
        name = author["AuthorName"]
        havebooks = myDB.action(
            'SELECT count("BookID") as counter from books WHERE AuthorName="%s" AND (Status="Have" OR Status="Open")'
            % name
        ).fetchone()
        myDB.action('UPDATE authors set HaveBooks="%s" where AuthorName="%s"' % (havebooks["counter"], name))
        totalbooks = myDB.action(
            'SELECT count("BookID") as counter FROM books WHERE AuthorName="%s" AND Status!="Ignored"' % name
        ).fetchone()
        myDB.action('UPDATE authors set UnignoredBooks="%s" where AuthorName="%s"' % (totalbooks["counter"], name))

    logger.info("Library scan complete")
def find_book_in_db(myDB, author, book):
    """PAB fuzzy search for a book already in the library.

    Returns the LazyLibrarian BookID when an exact or sufficiently fuzzy
    match is found, otherwise 0 so the caller can easily update status.
    """
    # Prefer an exact author & title match; quotes are doubled for SQLite.
    exact = myDB.match(
        'SELECT BookID FROM books where AuthorName="%s" and BookName="%s"' %
        (author.replace('"', '""'), book.replace('"', '""')))
    if exact:
        logger.debug('Exact match [%s]' % book)
        return exact['BookID']

    # No exact match - fuzzy-compare against every book by this author.
    # Hard-coded thresholds for now: ratio > 90, partial_ratio > 75,
    # partname > 95.  Maybe make them configurable in config.ini later.
    candidates = myDB.select(
        'SELECT BookID,BookName FROM books where AuthorName="%s"' %
        author.replace('"', '""'))

    best_ratio = 0
    best_partial = 0
    best_partname = 0
    ratio_name = ""
    partial_name = ""
    partname_name = ""
    ratio_id = 0
    partial_id = 0
    partname_id = 0
    partname = 0

    book_lower = unaccented(book.lower())
    # For the "partname" score, compare only the part before any colon
    # (i.e. drop a subtitle if the title has one).
    book_partname = book_lower.split(':')[0] if ':' in book_lower else ''

    for candidate in candidates:
        # Tidy up the candidate the same way to raise fuzziness scores;
        # keep the raw BookName for logging and for the tie-break below.
        cand_lower = unaccented(candidate['BookName'].lower())
        ratio = fuzz.ratio(book_lower, cand_lower)
        partial = fuzz.partial_ratio(book_lower, cand_lower)
        if book_partname:
            partname = fuzz.partial_ratio(book_partname, cand_lower)

        # Lose a point per extra word so the closest-length title wins.
        word_diff = abs(len(getList(book_lower)) - len(getList(cand_lower)))
        ratio -= word_diff
        partial -= word_diff

        if ratio > best_ratio:
            best_ratio = ratio
            ratio_name = candidate['BookName']
            ratio_id = candidate['BookID']
        if partial > best_partial:
            best_partial = partial
            partial_name = candidate['BookName']
            partial_id = candidate['BookID']
        if partname > best_partname:
            best_partname = partname
            partname_name = candidate['BookName']
            partname_id = candidate['BookID']

        if partial == best_partial:
            # On a tie, prefer the match closest to the left edge, i.e.
            # favour titles that start with the search string - this
            # eliminates most false matches against omnibus editions.
            # Find the position of the shorter string inside the longer.
            if len(getList(book_lower)) >= len(getList(cand_lower)):
                pos_new = book_lower.find(cand_lower)
            else:
                pos_new = cand_lower.find(book_lower)
            if len(getList(book_lower)) >= len(getList(partial_name.lower())):
                pos_best = book_lower.find(partial_name.lower())
            else:
                pos_best = partial_name.lower().find(book_lower)
            if pos_new < pos_best:
                logger.debug(
                    "Fuzz left change, prefer [%s] over [%s] for [%s]" %
                    (candidate['BookName'], partial_name, book))
                best_partial = partial
                partial_name = candidate['BookName']
                partial_id = candidate['BookID']

    if best_ratio > 90:
        logger.debug("Fuzz match ratio [%d] [%s] [%s]" % (best_ratio, book, ratio_name))
        return ratio_id
    if best_partial > 75:
        logger.debug("Fuzz match partial [%d] [%s] [%s]" % (best_partial, book, partial_name))
        return partial_id
    if best_partname > 95:
        logger.debug("Fuzz match partname [%d] [%s] [%s]" % (best_partname, book, partname_name))
        return partname_id

    logger.debug(
        'Fuzz failed [%s - %s] ratio [%d,%s], partial [%d,%s], partname [%d,%s]' %
        (author, book, best_ratio, ratio_name, best_partial, partial_name,
         best_partname, partname_name))
    return 0
def find_results(self, authorname=None, queue=None):
    """Search the Google Books API for `authorname` (a keyword that may be
    an author name, a title, or a bare ISBN) and put a list of result
    dicts onto `queue`.

    NOTE(review): Python 2 code (`except HTTPError, err`, urllib/urllib2).
    `authorname` is presumably the raw search keyword while `self.name`
    is its URL-ready form - confirm against the caller.
    """
    threading.currentThread().name = "GB-SEARCH"
    resultlist = []
    # See if we should treat the keyword as an ISBN, otherwise search
    # author and title fields.  int(authorname[:-1]) drops the final
    # character (an ISBN check digit may be 'X', which int() would reject).
    try:
        isbn_check = int(authorname[:-1])
        # 9 or 12 digits + check digit => ISBN-10 / ISBN-13
        if (len(str(isbn_check)) == 9) or (len(str(isbn_check)) == 12):
            api_strings = ['isbn:']
        else:
            api_strings = ['inauthor:', 'intitle:']
    except:
        # not numeric => not an ISBN
        api_strings = ['inauthor:', 'intitle:']
    api_hits = 0
    logger.info('Now searching Google Books API with keyword: ' + self.name)
    for api_value in api_strings:
        startindex = 0
        if api_value == "isbn:":
            set_url = self.url + urllib.quote(api_value + self.name)
        else:
            # quote the phrase so multi-word names match exactly
            set_url = self.url + urllib.quote(api_value + '"' + self.name + '"')
        try:
            startindex = 0
            resultcount = 0
            removedResults = 0  # NOTE(review): assigned but never used here
            ignored = 0
            total_count = 0
            no_author_count = 0
            # page through the results 40 at a time until exhausted
            while True:
                self.params['startIndex'] = startindex
                URL = set_url + '&' + urllib.urlencode(self.params)
                try:
                    jsonresults = json.JSONDecoder().decode(
                        urllib2.urlopen(URL, timeout=30).read())
                    api_hits = api_hits + 1
                    number_results = jsonresults['totalItems']
                    logger.debug('Searching url: ' + URL)
                    if number_results == 0:
                        logger.info(
                            'Found no results for %s with value: %s' %
                            (api_value, self.name))
                        break
                    else:
                        pass
                except HTTPError, err:
                    logger.warn(
                        'Google Books API Error [%s]: Check your API key or wait a while' %
                        err.msg)
                    break
                # advance before processing: the loop-end test below
                # compares this against totalItems to decide when to stop
                startindex = startindex + 40
                for item in jsonresults['items']:
                    total_count = total_count + 1
                    # skip if no author, no author is no book.
                    try:
                        Author = item['volumeInfo']['authors'][0]
                    except KeyError:
                        logger.debug(
                            'Skipped a result without authorfield.')
                        no_author_count = no_author_count + 1
                        continue
                    try:
                        # skip if language is not in the preferred list
                        booklang = item['volumeInfo']['language']
                        valid_langs = ([
                            valid_lang.strip()
                            for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')
                        ])
                        if booklang not in valid_langs:
                            logger.debug(
                                'Skipped a book with language %s' % booklang)
                            ignored = ignored + 1
                            continue
                    except KeyError:
                        ignored = ignored + 1
                        logger.debug(
                            'Skipped a result where no language is found')
                        continue
                    # volumeInfo fields are all optional - default each one
                    try:
                        bookpub = item['volumeInfo']['publisher']
                    except KeyError:
                        bookpub = None
                    try:
                        booksub = item['volumeInfo']['subtitle']
                    except KeyError:
                        booksub = None
                    try:
                        bookdate = item['volumeInfo']['publishedDate']
                    except KeyError:
                        bookdate = '0000-00-00'
                    # keep year only
                    bookdate = bookdate[:4]
                    try:
                        bookimg = item['volumeInfo']['imageLinks'][
                            'thumbnail']
                    except KeyError:
                        bookimg = 'images/nocover.png'
                    try:
                        bookrate = item['volumeInfo']['averageRating']
                    except KeyError:
                        bookrate = 0
                    try:
                        bookpages = item['volumeInfo']['pageCount']
                    except KeyError:
                        bookpages = '0'
                    try:
                        bookgenre = item['volumeInfo']['categories'][0]
                    except KeyError:
                        bookgenre = None
                    try:
                        bookdesc = item['volumeInfo']['description']
                    except KeyError:
                        bookdesc = 'Not available'
                    try:
                        num_reviews = item['volumeInfo']['ratingsCount']
                    except KeyError:
                        num_reviews = 0
                    # NOTE(review): only checks the FIRST identifier entry,
                    # so an ISBN_13 listed first is discarded - confirm
                    # whether that is intended
                    try:
                        if item['volumeInfo']['industryIdentifiers'][0][
                                'type'] == 'ISBN_10':
                            bookisbn = item['volumeInfo'][
                                'industryIdentifiers'][0]['identifier']
                        else:
                            bookisbn = 0
                    except KeyError:
                        bookisbn = 0
                    # score the result against the search keyword
                    author_fuzz = fuzz.ratio(Author.lower(), authorname.lower())
                    book_fuzz = fuzz.ratio(
                        item['volumeInfo']['title'].lower(),
                        authorname.lower())
                    # isbn_fuzz is 100 when the keyword itself looks like an ISBN
                    try:
                        isbn_check = int(authorname[:-1])
                        if (len(str(isbn_check)) == 9) or (len(
                                str(isbn_check)) == 12):
                            isbn_fuzz = int(100)
                        else:
                            isbn_fuzz = int(0)
                    except:
                        isbn_fuzz = int(0)
                    highest_fuzz = max(author_fuzz, book_fuzz, isbn_fuzz)
                    resultlist.append({
                        'authorname': Author,
                        'bookid': item['id'],
                        'bookname': item['volumeInfo']['title'],
                        'booksub': booksub,
                        'bookisbn': bookisbn,
                        'bookpub': bookpub,
                        'bookdate': bookdate,
                        'booklang': booklang,
                        'booklink': item['volumeInfo']['canonicalVolumeLink'],
                        'bookrate': float(bookrate),
                        'bookimg': bookimg,
                        'bookpages': bookpages,
                        'bookgenre': bookgenre,
                        'bookdesc': bookdesc,
                        'author_fuzz': author_fuzz,
                        'book_fuzz': book_fuzz,
                        'isbn_fuzz': isbn_fuzz,
                        'highest_fuzz': highest_fuzz,
                        'num_reviews': num_reviews
                    })
                    resultcount = resultcount + 1
                # stop paging once we've walked past the reported total
                if startindex >= number_results:
                    logger.debug("Found %s total results" % total_count)
                    logger.debug("Removed %s bad language results" % ignored)
                    logger.debug("Removed %s books with no author" % no_author_count)
                    logger.info(
                        "Showing %s results for (%s) with keyword: %s" %
                        (resultcount, api_value, authorname))
                    break
                else:
                    continue
        except KeyError:
            # e.g. a page with totalItems > 0 but no 'items' key
            break
    logger.info(
        'The Google Books API was hit %s times for keyword %s' %
        (str(api_hits), self.name))
    queue.put(resultlist)
def LibraryScan(dir=None):
    """Scan `dir` (default: the configured download directory) for ebooks,
    match them to known authors/books, update their DB status/location,
    refresh per-author book counts and cache cover images.

    Side effects: drops and recreates the `stats` table, inserts into the
    `languages` ISBN-prefix cache, may import new authors via GoodReads,
    and updates `books`/`authors` rows.  Logs a summary when done.
    """
    if not dir:
        if not lazylibrarian.DOWNLOAD_DIR:
            return
        else:
            dir = lazylibrarian.DOWNLOAD_DIR
    if not os.path.isdir(dir):
        logger.warn(
            'Cannot find directory: %s. Not scanning' %
            dir.decode(lazylibrarian.SYS_ENCODING, 'replace'))
        return
    myDB = database.DBConnection()
    # stats table is rebuilt per scan; provider lookups accumulate into it
    myDB.action('drop table if exists stats')
    myDB.action(
        'create table stats (authorname text, GR_book_hits int, GR_lang_hits int, LT_lang_hits int, \
            GB_lang_change, cache_hits int, bad_lang int, bad_char int, uncached int )')
    logger.info(
        'Scanning ebook directory: %s' %
        dir.decode(lazylibrarian.SYS_ENCODING, 'replace'))
    new_book_count = 0
    file_count = 0
    if lazylibrarian.FULL_SCAN:
        # full scan: mark "Open" books whose file has vanished as not found
        books = myDB.select(
            'select AuthorName, BookName, BookFile, BookID from books where Status="Open"')
        status = lazylibrarian.NOTFOUND_STATUS
        logger.info('Missing books will be marked as %s' % status)
        for book in books:
            bookName = book['BookName']
            bookAuthor = book['AuthorName']
            bookID = book['BookID']
            bookfile = book['BookFile']
            if not(bookfile and os.path.isfile(bookfile)):
                myDB.action('update books set Status="%s" where BookID="%s"' % (status, bookID))
                myDB.action('update books set BookFile="" where BookID="%s"' % bookID)
                logger.warn('Book %s - %s updated as not found on disk' % (bookAuthor, bookName))
    # to save repeat-scans of the same directory if it contains multiple
    # formats of the same book, keep track of which directories we've
    # already looked at
    processed_subdirectories = []
    # massage the EBOOK_DEST_FILE config parameter into something we can use
    # with regular expression matching: backslash-escape every character...
    matchString = ''
    for char in lazylibrarian.EBOOK_DEST_FILE:
        matchString = matchString + '\\' + char
    # ...and build the alternation of configured ebook extensions
    booktypes = ''
    count = -1
    booktype_list = formatter.getList(lazylibrarian.EBOOK_TYPE)
    for book_type in booktype_list:
        count += 1
        if count == 0:
            booktypes = book_type
        else:
            booktypes = booktypes + '|' + book_type
    # NOTE(review): '[' + booktypes + ']' is a character class, not an
    # alternation group - multi-char extensions match loosely; confirm
    matchString = matchString.replace("\\$\\A\\u\\t\\h\\o\\r", "(?P<author>.*?)").replace(
        "\\$\\T\\i\\t\\l\\e", "(?P<book>.*?)") + '\.[' + booktypes + ']'
    pattern = re.compile(matchString, re.VERBOSE)
    for r, d, f in os.walk(dir):
        # prune hidden dirs and magazine dirs (leading "_") in place so
        # os.walk does not descend into them
        for directory in d[:]:
            if directory.startswith("."):
                d.remove(directory)
            # prevent magazine being scanned
            if directory.startswith("_"):
                d.remove(directory)
        for files in f:
            file_count += 1
            if isinstance(r, str):
                r = r.decode('utf-8')
            subdirectory = r.replace(dir, '')
            # Skip if we've done this directory before; conditional on a
            # config switch in case the user keeps multiple different
            # books in the same subdirectory
            if (lazylibrarian.IMP_SINGLEBOOK) and (subdirectory in processed_subdirectories):
                logger.debug("[%s] already scanned" % subdirectory)
            else:
                # If this is a book, try to get author/title/isbn/language:
                # 1. if epub or mobi, read embedded metadata from the book
                # 2. if metadata.opf exists, use that, allowing it to
                #    override embedded metadata (user may have edited it
                #    to merge author aliases together)
                # 3. fall back to filename pattern match for author/title
                #    and look up isbn/lang from LT or GR later
                match = 0
                if formatter.is_valid_booktype(files):
                    logger.debug("[%s] Now scanning subdirectory %s" %
                                 (dir, subdirectory))
                    language = "Unknown"
                    isbn = ""
                    book = ""
                    author = ""
                    words = files.split('.')
                    extn = words[len(words) - 1]
                    # if it's an epub or a mobi we can try to read metadata from it
                    if (extn == "epub") or (extn == "mobi"):
                        book_filename = os.path.join(
                            r.encode(lazylibrarian.SYS_ENCODING),
                            files.encode(lazylibrarian.SYS_ENCODING))
                        try:
                            res = get_book_info(book_filename)
                        except:
                            res = {}
                        if 'title' in res and 'creator' in res:
                            # this is the minimum we need
                            match = 1
                            book = res['title']
                            author = res['creator']
                            if 'language' in res:
                                language = res['language']
                            if 'identifier' in res:
                                isbn = res['identifier']
                            if 'type' in res:
                                extn = res['type']
                            logger.debug("book meta [%s] [%s] [%s] [%s] [%s]" %
                                         (isbn, language, author, book, extn))
                        else:
                            logger.debug("Book meta incomplete in %s" %
                                         book_filename)
                    # calibre uses "metadata.opf", LL uses "bookname - authorname.opf"
                    # just look for any .opf file in the current directory since we don't know
                    # LL preferred authorname/bookname at this point.
                    # Allow metadata in file to override book contents as may be users pref
                    metafile = opf_file(r)
                    try:
                        res = get_book_info(metafile)
                    except:
                        res = {}
                    if 'title' in res and 'creator' in res:
                        # this is the minimum we need
                        match = 1
                        book = res['title']
                        author = res['creator']
                        if 'language' in res:
                            language = res['language']
                        if 'identifier' in res:
                            isbn = res['identifier']
                        logger.debug(
                            "file meta [%s] [%s] [%s] [%s]" %
                            (isbn, language, author, book))
                    else:
                        logger.debug("File meta incomplete in %s" % metafile)
                    if not match:
                        # no author/book from metadata file, and not embedded either
                        match = pattern.match(files)
                        if match:
                            author = match.group("author")
                            book = match.group("book")
                        else:
                            logger.debug("Pattern match failed [%s]" % files)
                    if match:
                        # flag that we found a book in this subdirectory
                        processed_subdirectories.append(subdirectory)
                        # If we have a valid looking isbn, and language != "Unknown", add it to cache
                        if language != "Unknown" and formatter.is_valid_isbn(isbn):
                            logger.debug(
                                "Found Language [%s] ISBN [%s]" %
                                (language, isbn))
                            # we need to add it to language cache if not already
                            # there, is_valid_isbn has checked length is 10 or 13
                            # (cache key is the ISBN group/prefix digits)
                            if len(isbn) == 10:
                                isbnhead = isbn[0:3]
                            else:
                                isbnhead = isbn[3:6]
                            match = myDB.action(
                                'SELECT lang FROM languages where isbn = "%s"' %
                                (isbnhead)).fetchone()
                            if not match:
                                myDB.action(
                                    'insert into languages values ("%s", "%s")' %
                                    (isbnhead, language))
                                logger.debug(
                                    "Cached Lang [%s] ISBN [%s]" %
                                    (language, isbnhead))
                            else:
                                logger.debug(
                                    "Already cached Lang [%s] ISBN [%s]" %
                                    (language, isbnhead))
                        # get authors name in a consistent format
                        if "," in author:  # "surname, forename"
                            words = author.split(',')
                            author = words[1].strip() + ' ' + words[0].strip()  # "forename surname"
                        # single-letter forename => initials style, e.g. "J R R Tolkien"
                        # NOTE(review): author[1] raises IndexError on a 1-char author name
                        if author[1] == ' ':
                            author = author.replace(' ', '.')
                            author = author.replace('..', '.')
                        # Check if the author exists, and import the author if not,
                        # before starting any complicated book-name matching to save repeating the search
                        #
                        check_exist_author = myDB.action(
                            'SELECT * FROM authors where AuthorName="%s"' % author).fetchone()
                        if not check_exist_author and lazylibrarian.ADD_AUTHOR:
                            # no match for supplied author, but we're allowed to
                            # add new ones
                            GR = GoodReads(author)
                            try:
                                author_gr = GR.find_author_id()
                            except:
                                logger.warn(
                                    "Error finding author id for [%s]" %
                                    author)
                                continue
                            # only try to add if GR data matches found author data
                            if author_gr:
                                authorname = author_gr['authorname']
                                # "J.R.R. Tolkien" is the same person as "J. R. R. Tolkien" and "J R R Tolkien"
                                # normalise both names to underscore-separated form
                                match_auth = author.replace('.', '_')
                                match_auth = match_auth.replace(' ', '_')
                                match_auth = match_auth.replace('__', '_')
                                match_name = authorname.replace('.', '_')
                                match_name = match_name.replace(' ', '_')
                                match_name = match_name.replace('__', '_')
                                match_name = common.remove_accents(match_name)
                                match_auth = common.remove_accents(match_auth)
                                # allow a degree of fuzziness to cater for different accented character handling.
                                # some author names have accents,
                                # filename may have the accented or un-accented version of the character
                                # The currently non-configurable value of fuzziness might need to go in config
                                # We stored GoodReads unmodified author name in
                                # author_gr, so store in LL db under that
                                # fuzz.ratio doesn't lowercase for us
                                match_fuzz = fuzz.ratio(match_auth.lower(), match_name.lower())
                                if match_fuzz < 90:
                                    logger.debug(
                                        "Failed to match author [%s] fuzz [%d]" %
                                        (author, match_fuzz))
                                    logger.debug(
                                        "Failed to match author [%s] to authorname [%s]" %
                                        (match_auth, match_name))
                                # To save loading hundreds of books by unknown
                                # authors at GR or GB, ignore if author "Unknown"
                                if (author != "Unknown") and (match_fuzz >= 90):
                                    # use "intact" name for author that we stored in
                                    # GR author_dict, not one of the various mangled versions
                                    # otherwise the books appear to be by a different author!
                                    author = author_gr['authorname']
                                    # this new authorname may already be in the
                                    # database, so check again
                                    check_exist_author = myDB.action(
                                        'SELECT * FROM authors where AuthorName="%s"' % author).fetchone()
                                    if not check_exist_author:
                                        logger.debug(
                                            "Adding new author [%s]" %
                                            author)
                                        try:
                                            importer.addAuthorToDB(author)
                                            check_exist_author = myDB.action(
                                                'SELECT * FROM authors where AuthorName="%s"' % author).fetchone()
                                        except:
                                            continue
                        # check author exists in db, either newly loaded or already there
                        if not check_exist_author:
                            logger.debug(
                                "Failed to match author [%s] in database" %
                                author)
                        else:
                            # author exists, check if this book by this author is in our database
                            # metadata might have quotes in book name
                            book = book.replace('"', '').replace("'", "")
                            bookid = find_book_in_db(myDB, author, book)
                            if bookid:
                                # check if book is already marked as "Open" (if so,
                                # we already had it)
                                check_status = myDB.action(
                                    'SELECT Status from books where BookID="%s"' %
                                    bookid).fetchone()
                                if check_status['Status'] != 'Open':
                                    # update status as we've got this book
                                    myDB.action(
                                        'UPDATE books set Status="Open" where BookID="%s"'
                                        % bookid)
                                    book_filename = os.path.join(r, files)
                                    # update book location so we can check if it
                                    # gets removed, or allow click-to-open
                                    myDB.action(
                                        'UPDATE books set BookFile="%s" where BookID="%s"' %
                                        (book_filename, bookid))
                                    new_book_count += 1
    # --- scan finished: report stats and refresh derived data ---
    cachesize = myDB.action("select count('ISBN') as counter from languages").fetchone()
    logger.info(
        "%s new/modified books found and added to the database" %
        new_book_count)
    logger.info("%s files processed" % file_count)
    stats = myDB.action(
        "SELECT sum(GR_book_hits), sum(GR_lang_hits), sum(LT_lang_hits), sum(GB_lang_change), \
            sum(cache_hits), sum(bad_lang), sum(bad_char), sum(uncached) FROM stats").fetchone()
    if stats['sum(GR_book_hits)'] is not None:
        # only show stats if new books added
        if lazylibrarian.BOOK_API == "GoogleBooks":
            logger.debug(
                "GoogleBooks was hit %s times for books" %
                stats['sum(GR_book_hits)'])
            logger.debug(
                "GoogleBooks language was changed %s times" %
                stats['sum(GB_lang_change)'])
        if lazylibrarian.BOOK_API == "GoodReads":
            logger.debug(
                "GoodReads was hit %s times for books" %
                stats['sum(GR_book_hits)'])
            logger.debug(
                "GoodReads was hit %s times for languages" %
                stats['sum(GR_lang_hits)'])
        logger.debug(
            "LibraryThing was hit %s times for languages" %
            stats['sum(LT_lang_hits)'])
        logger.debug(
            "Language cache was hit %s times" %
            stats['sum(cache_hits)'])
        logger.debug(
            "Unwanted language removed %s books" %
            stats['sum(bad_lang)'])
        logger.debug(
            "Unwanted characters removed %s books" %
            stats['sum(bad_char)'])
        logger.debug(
            "Unable to cache %s books with missing ISBN" %
            stats['sum(uncached)'])
    logger.debug("Cache %s hits, %s miss" % (lazylibrarian.CACHE_HIT, lazylibrarian.CACHE_MISS))
    logger.debug("ISBN Language cache holds %s entries" % cachesize['counter'])
    stats = len(myDB.select('select BookID from Books where status="Open" and BookLang="Unknown"'))
    if stats:
        logger.warn("There are %s books in your library with unknown language" % stats)
    authors = myDB.select('select AuthorName from authors')
    # Update bookcounts for all authors, not just new ones - refresh may have located
    # new books for existing authors especially if switched provider gb/gr
    logger.debug('Updating bookcounts for %i authors' % len(authors))
    for author in authors:
        name = author['AuthorName']
        havebooks = myDB.action(
            'SELECT count("BookID") as counter from books WHERE AuthorName="%s" AND (Status="Have" OR Status="Open")' %
            name).fetchone()
        myDB.action('UPDATE authors set HaveBooks="%s" where AuthorName="%s"' % (havebooks['counter'], name))
        totalbooks = myDB.action(
            'SELECT count("BookID") as counter FROM books WHERE AuthorName="%s"' % name).fetchone()
        myDB.action('UPDATE authors set TotalBooks="%s" where AuthorName="%s"' % (totalbooks['counter'], name))
        unignoredbooks = myDB.action(
            'SELECT count("BookID") as counter FROM books WHERE AuthorName="%s" AND Status!="Ignored"' % name).fetchone()
        myDB.action('UPDATE authors set UnignoredBooks="%s" where AuthorName="%s"' % (unignoredbooks['counter'], name))
    # cache any cover images still pointing at remote URLs
    covers = myDB.action("select count('bookimg') as counter from books where bookimg like 'http%'").fetchone()
    logger.info("Caching covers for %s books" % covers['counter'])
    images = myDB.action('select bookid, bookimg, bookname from books where bookimg like "http%"')
    for item in images:
        bookid = item['bookid']
        bookimg = item['bookimg']
        bookname = item['bookname']
        newimg = bookwork.cache_cover(bookid, bookimg)
        if newimg != bookimg:
            myDB.action('update books set BookImg="%s" where BookID="%s"' % (newimg, bookid))
    logger.info('Library scan complete')
def LibraryScan(dir=None):
    """Scan `dir` (default: the configured download directory) for ebooks,
    match them to known authors/books and update their DB status/location.

    Older variant of the scanner: uses parameterized SQL and tracks
    touched authors in `new_authors` for the final count refresh.
    Side effects: drops/recreates the `stats` table, inserts into the
    `languages` cache, may import new authors via GoodReads, updates
    `books`/`authors` rows.
    """
    if not dir:
        if not lazylibrarian.DOWNLOAD_DIR:
            return
        else:
            dir = lazylibrarian.DOWNLOAD_DIR
    if not os.path.isdir(dir):
        logger.warn("Cannot find directory: %s. Not scanning" %
                    dir.decode(lazylibrarian.SYS_ENCODING, "replace"))
        return
    myDB = database.DBConnection()
    # stats table is rebuilt per scan
    myDB.action("drop table if exists stats")
    myDB.action(
        "create table stats ( authorname text, GR_book_hits int, GR_lang_hits int, LT_lang_hits int, GB_lang_change, cache_hits int, bad_lang int, bad_char int, uncached int )"
    )
    new_authors = []
    logger.info("Scanning ebook directory: %s" % dir.decode(lazylibrarian.SYS_ENCODING, "replace"))
    book_list = []  # NOTE(review): never used in this function
    new_book_count = 0
    file_count = 0
    book_exists = False
    if lazylibrarian.FULL_SCAN:
        # full scan: mark "Open" books whose file has vanished as not found
        books = myDB.select("select AuthorName, BookName, BookFile, BookID from books where Status=?", [u"Open"])
        status = lazylibrarian.NOTFOUND_STATUS
        logger.info("Missing books will be marked as %s" % status)
        for book in books:
            bookName = book["BookName"]
            bookAuthor = book["AuthorName"]
            bookID = book["BookID"]
            bookfile = book["BookFile"]
            if os.path.isfile(bookfile):
                book_exists = True
            else:
                myDB.action("update books set Status=? where BookID=?", [status, bookID])
                myDB.action('update books set BookFile="" where BookID=?', [bookID])
                logger.info("Book %s updated as not found on disk" % bookfile)
            # (a commented-out per-format existence check using
            # EBOOK_DEST_FOLDER/EBOOK_DEST_FILE lived here; it was never
            # enabled and has been left out of the active code path)
            if bookAuthor not in new_authors:
                new_authors.append(bookAuthor)
    # guess this was meant to save repeat-scans of the same directory
    # if it contains multiple formats of the same book, but there was no code
    # that looked at the array. renamed from latest to processed to make purpose clearer
    processed_subdirectories = []
    # massage the EBOOK_DEST_FILE config parameter into something we can use
    # with regular expression matching: backslash-escape every character...
    matchString = ""
    for char in lazylibrarian.EBOOK_DEST_FILE:
        matchString = matchString + "\\" + char
    # ...and build the alternation of configured ebook extensions
    booktypes = ""
    count = -1
    booktype_list = getList(lazylibrarian.EBOOK_TYPE)
    for book_type in booktype_list:
        count += 1
        if count == 0:
            booktypes = book_type
        else:
            booktypes = booktypes + "|" + book_type
    # NOTE(review): "[" + booktypes + "]" builds a character class, not a
    # group - multi-char extensions match loosely; confirm intended
    matchString = (
        matchString.replace("\\$\\A\\u\\t\\h\\o\\r", "(?P<author>.*?)").replace("\\$\\T\\i\\t\\l\\e", "(?P<book>.*?)")
        + "\.["
        + booktypes
        + "]"
    )
    pattern = re.compile(matchString, re.VERBOSE)
    for r, d, f in os.walk(dir):
        # prune hidden dirs and magazine dirs (leading "_") in place so
        # os.walk does not descend into them
        for directory in d[:]:
            if directory.startswith("."):
                d.remove(directory)
            # prevent magazine being scanned
            if directory.startswith("_"):
                d.remove(directory)
        for files in f:
            file_count += 1
            subdirectory = r.replace(dir, "")
            # Skip if we've done this directory before; conditional on a
            # config switch in case the user keeps multiple different
            # books in the same subdirectory
            if (lazylibrarian.IMP_SINGLEBOOK) and (subdirectory in processed_subdirectories):
                logger.debug("[%s] already scanned" % subdirectory)
            else:
                logger.info(
                    "[%s] Now scanning subdirectory %s"
                    % (
                        dir.decode(lazylibrarian.SYS_ENCODING, "replace"),
                        subdirectory.decode(lazylibrarian.SYS_ENCODING, "replace"),
                    )
                )
                # If this is a book, try to get author/title/isbn/language:
                # 1. if metadata.opf exists, use that
                # 2. else if epub or mobi, read metadata from the book
                # 3. else try pattern match for author/title and look up
                #    isbn/lang from LT or GR later
                #
                # Is it a book (extension found in booktypes)
                match = 0
                words = files.split(".")
                extn = words[len(words) - 1]
                if extn in booktypes:
                    # see if there is a metadata file in this folder with the info we need
                    try:
                        metafile = os.path.join(r, "metadata.opf").encode(lazylibrarian.SYS_ENCODING)
                        res = get_book_info(metafile)
                        if res:
                            book = res["title"]
                            author = res["creator"]
                            language = res["language"]
                            isbn = res["identifier"]
                            match = 1
                            logger.debug("file meta [%s] [%s] [%s] [%s]" % (isbn, language, author, book))
                    except:
                        logger.debug("No metadata file in %s" % r)
                    if not match:
                        # it's a book, but no external metadata found
                        # if it's an epub or a mobi we can try to read metadata from it
                        if (extn == "epub") or (extn == "mobi"):
                            book_file = os.path.join(r, files).encode(lazylibrarian.SYS_ENCODING)
                            res = get_book_info(book_file)
                            if res:
                                book = res["title"]
                                author = res["creator"]
                                language = res["language"]
                                isbn = res["identifier"]
                                match = 1
                                logger.debug("book meta [%s] [%s] [%s] [%s]" % (isbn, language, author, book))
                    if not match:
                        # fall back to the filename pattern
                        match = pattern.match(files)
                        if match:
                            author = match.group("author")
                            book = match.group("book")
                        else:
                            logger.debug("Pattern match failed [%s]" % files)
                    else:
                        processed_subdirectories.append(subdirectory)  # flag that we found a book in this subdirectory
                    #
                    # If we have a valid looking isbn, and language != "Unknown", add it to cache
                    #
                    # NOTE(review): when the title came only from the
                    # filename pattern (or nothing matched), `language`,
                    # `isbn`, `book` and `author` may be unbound here and
                    # raise NameError - confirm against the repo history
                    if not language:
                        language = "Unknown"
                    # strip any formatting from the isbn
                    isbn = re.sub("[- ]", "", isbn)
                    if len(isbn) != 10 and len(isbn) != 13:
                        isbn = ""
                    if not isbn.isdigit():
                        isbn = ""
                    if isbn != "" and language != "Unknown":
                        logger.debug("Found Language [%s] ISBN [%s]" % (language, isbn))
                        # we need to add it to language cache if not already there
                        # (cache key is the ISBN group/prefix digits)
                        if len(isbn) == 10:
                            isbnhead = isbn[0:3]
                        else:
                            isbnhead = isbn[3:6]
                        match = myDB.action('SELECT lang FROM languages where isbn = "%s"' % (isbnhead)).fetchone()
                        if not match:
                            myDB.action('insert into languages values ("%s", "%s")' % (isbnhead, language))
                            logger.debug("Cached Lang [%s] ISBN [%s]" % (language, isbnhead))
                        else:
                            logger.debug("Already cached Lang [%s] ISBN [%s]" % (language, isbnhead))
                    # get authors name in a consistent format
                    if "," in author:  # "surname, forename"
                        words = author.split(",")
                        author = words[1].strip() + " " + words[0].strip()  # "forename surname"
                    # collapse dotted initials to plain spaces
                    author = author.replace(". ", " ")
                    author = author.replace(".", " ")
                    author = author.replace("  ", " ")
                    # Check if the author exists, and import the author if not,
                    # before starting any complicated book-name matching to save repeating the search
                    #
                    check_exist_author = myDB.action("SELECT * FROM authors where AuthorName=?", [author]).fetchone()
                    if not check_exist_author and lazylibrarian.ADD_AUTHOR:
                        # no match for supplied author, but we're allowed to add new ones
                        GR = GoodReads(author)
                        try:
                            author_gr = GR.find_author_id()
                        except:
                            logger.error("Error finding author id for [%s]" % author)
                            continue
                        # only try to add if GR data matches found author data
                        # not sure what this is for, never seems to fail??
                        if author_gr:
                            authorname = author_gr["authorname"]
                            # "J.R.R. Tolkien" is the same person as "J. R. R. Tolkien" and "J R R Tolkien"
                            # normalise both names to underscore-separated form
                            match_auth = author.replace(".", "_")
                            match_auth = match_auth.replace(" ", "_")
                            match_auth = match_auth.replace("__", "_")
                            match_name = authorname.replace(".", "_")
                            match_name = match_name.replace(" ", "_")
                            match_name = match_name.replace("__", "_")
                            # allow a degree of fuzziness to cater for different accented character handling.
                            # some author names have accents,
                            # filename may have the accented or un-accented version of the character
                            # The (currently non-configurable) value of fuzziness works for one accented character
                            # We stored GoodReads unmodified author name in author_gr, so store in LL db under that
                            match_fuzz = fuzz.ratio(match_auth, match_name)
                            if match_fuzz < 90:
                                logger.info("Failed to match author [%s] fuzz [%d]" % (author, match_fuzz))
                                logger.info("match author [%s] authorname [%s]" % (match_auth, match_name))
                            # To save loading hundreds of books by unknown authors at GR or GB, ignore if author "Unknown"
                            if (author != "Unknown") and (match_fuzz >= 90):
                                # use "intact" name for author that we stored in
                                # GR author_dict, not one of the various mangled versions
                                # otherwise the books appear to be by a different author!
                                author = author_gr["authorname"]
                                # this new authorname may already be in the database, so check again
                                check_exist_author = myDB.action(
                                    "SELECT * FROM authors where AuthorName=?", [author]
                                ).fetchone()
                                if not check_exist_author:
                                    logger.info("Adding new author [%s]" % author)
                                    if author not in new_authors:
                                        new_authors.append(author)
                                    try:
                                        importer.addAuthorToDB(author)
                                        check_exist_author = myDB.action(
                                            "SELECT * FROM authors where AuthorName=?", [author]
                                        ).fetchone()
                                    except:
                                        continue
                    # check author exists in db, either newly loaded or already there
                    if not check_exist_author:
                        logger.info("Failed to match author [%s] in database" % author)
                    else:
                        # author exists, check if this book by this author is in our database
                        bookid = find_book_in_db(myDB, author, book)
                        if bookid:
                            # check if book is already marked as "Open" (if so, we already had it)
                            check_status = myDB.action("SELECT Status from books where BookID=?", [bookid]).fetchone()
                            if check_status["Status"] != "Open":
                                # update status as we've got this book
                                myDB.action("UPDATE books set Status=? where BookID=?", [u"Open", bookid])
                                book_file = os.path.join(r, files).encode(lazylibrarian.SYS_ENCODING)
                                # update book location so we can check if it gets removed, or maybe allow click-to-open?
                                myDB.action("UPDATE books set BookFile=? where BookID=?", [book_file, bookid])
                                new_book_count += 1
    # --- scan finished: report stats and refresh derived data ---
    cachesize = myDB.action("select count(*) from languages").fetchone()
    logger.info("%s new/modified books found and added to the database" % new_book_count)
    logger.info("%s files processed" % file_count)
    stats = myDB.action(
        "SELECT sum(GR_book_hits), sum(GR_lang_hits), sum(LT_lang_hits), sum(GB_lang_change), sum(cache_hits), sum(bad_lang), sum(bad_char), sum(uncached) FROM stats"
    ).fetchone()
    if lazylibrarian.BOOK_API == "GoogleBooks":
        logger.info("GoogleBooks was hit %s times for books" % stats["sum(GR_book_hits)"])
        logger.info("GoogleBooks language was changed %s times" % stats["sum(GB_lang_change)"])
    if lazylibrarian.BOOK_API == "GoodReads":
        logger.info("GoodReads was hit %s times for books" % stats["sum(GR_book_hits)"])
        logger.info("GoodReads was hit %s times for languages" % stats["sum(GR_lang_hits)"])
    logger.info("LibraryThing was hit %s times for languages" % stats["sum(LT_lang_hits)"])
    logger.info("Language cache was hit %s times" % stats["sum(cache_hits)"])
    logger.info("Unwanted language removed %s books" % stats["sum(bad_lang)"])
    logger.info("Unwanted characters removed %s books" % stats["sum(bad_char)"])
    logger.info("Unable to cache %s books with missing ISBN" % stats["sum(uncached)"])
    logger.info("ISBN Language cache holds %s entries" % cachesize["count(*)"])
    stats = len(myDB.select("select BookID from Books where status=? and BookLang=?", ["Open", "Unknown"]))
    logger.info("There are %s books in your library with unknown language" % stats)
    # only authors touched during this scan get their counts refreshed
    logger.info("Updating %i authors" % len(new_authors))
    for auth in new_authors:
        havebooks = len(myDB.select("select BookName from Books where status=? and AuthorName=?", ["Open", auth]))
        myDB.action("UPDATE authors set HaveBooks=? where AuthorName=?", [havebooks, auth])
        totalbooks = len(myDB.select("select BookName from Books where status!=? and AuthorName=?", ["Ignored", auth]))
        myDB.action("UPDATE authors set UnignoredBooks=? where AuthorName=?", [totalbooks, auth])
    logger.info("Library scan complete")
def find_book_in_db(myDB, author, book):
    # PAB fuzzy search for book in library, return LL bookid if found or zero
    # if not, return bookid to more easily update status
    #
    # Exact match first, then three fuzzy scores per candidate book by the
    # same author: full ratio, partial ratio, and partial ratio against the
    # title with any series/subtitle info stripped (partname).
    # prefer an exact match on author & book
    match = myDB.match('SELECT BookID FROM books where AuthorName="%s" and BookName="%s"' %
                       (author.replace('"', '""'), book.replace('"', '""')))
    if match:
        logger.debug('Exact match [%s]' % book)
        return match['BookID']
    else:
        # Try a more complex fuzzy match against each book in the db by this author
        # Using hard-coded ratios for now, ratio high (>90), partial_ratio lower (>85)
        # These are results that work well on my library, minimal false matches and no misses
        # on books that should be matched
        # Maybe make ratios configurable in config.ini later
        books = myDB.select('SELECT BookID,BookName,BookISBN FROM books where AuthorName="%s"' %
                            author.replace('"', '""'))
        best_ratio = 0
        best_partial = 0
        best_partname = 0
        ratio_name = ""
        partial_name = ""
        partname_name = ""
        ratio_id = 0
        partial_id = 0
        partname_id = 0
        partname = 0
        book_lower = unaccented(book.lower())
        # partname is the title without series/subtitle decoration; only use
        # it when it actually differs from the full title
        book_partname, book_sub = split_title(author, book_lower)
        if book_partname == book_lower:
            book_partname = ''
        for a_book in books:
            # tidy up everything to raise fuzziness scores
            # still need to lowercase for matching against partial_name later on
            a_book_lower = unaccented(a_book['BookName'].lower())
            #
            ratio = fuzz.ratio(book_lower, a_book_lower)
            partial = fuzz.partial_ratio(book_lower, a_book_lower)
            if book_partname:
                partname = fuzz.partial_ratio(book_partname, a_book_lower)

            # lose a point for each extra word in the fuzzy matches so we get the closest match
            words = len(getList(book_lower))
            words -= len(getList(a_book_lower))
            ratio -= abs(words)
            partial -= abs(words)

            if ratio > best_ratio:
                best_ratio = ratio
                ratio_name = a_book['BookName']
                ratio_id = a_book['BookID']
            if partial > best_partial:
                best_partial = partial
                partial_name = a_book['BookName']
                partial_id = a_book['BookID']

            if partname > best_partname:
                best_partname = partname
                partname_name = a_book['BookName']
                partname_id = a_book['BookID']

            if partial == best_partial:
                # prefer the match closest to the left, ie prefer starting with a match and ignoring the rest
                # this eliminates most false matches against omnibuses when we want a single book
                # find the position of the shortest string in the longest
                if len(getList(book_lower)) >= len(getList(a_book_lower)):
                    match1 = book_lower.find(a_book_lower)
                else:
                    match1 = a_book_lower.find(book_lower)
                if len(getList(book_lower)) >= len(getList(partial_name.lower())):
                    match2 = book_lower.find(partial_name.lower())
                else:
                    match2 = partial_name.lower().find(book_lower)
                if match1 < match2:
                    logger.debug(
                        "Fuzz left change, prefer [%s] over [%s] for [%s]" %
                        (a_book['BookName'], partial_name, book))
                    best_partial = partial
                    partial_name = a_book['BookName']
                    partial_id = a_book['BookID']

        # hard-coded thresholds; checked in order of confidence
        if best_ratio > 90:
            logger.debug(
                "Fuzz match ratio [%d] [%s] [%s]" %
                (best_ratio, book, ratio_name))
            return ratio_id
        if best_partial > 85:
            logger.debug(
                "Fuzz match partial [%d] [%s] [%s]" %
                (best_partial, book, partial_name))
            return partial_id
        if best_partname > 95:
            logger.debug(
                "Fuzz match partname [%d] [%s] [%s]" %
                (best_partname, book, partname_name))
            return partname_id

        logger.debug(
            'Fuzz failed [%s - %s] ratio [%d,%s], partial [%d,%s], partname [%d,%s]' %
            (author, book, best_ratio, ratio_name, best_partial, partial_name,
             best_partname, partname_name))
        return 0
def find_results(self, searchterm=None, queue=None):
    """ GoogleBooks performs much better if we search for author OR title
        not both at once, so if searchterm is not isbn, two searches needed.
        Lazylibrarian searches use <ll> to separate title from author in searchterm
        If this token isn't present, it's an isbn or searchterm as supplied by user

        Results (list of dicts with fuzz scores) are delivered via queue.put().
    """
    try:
        myDB = database.DBConnection()
        resultlist = []
        # See if we should check ISBN field, otherwise ignore it
        api_strings = ['inauthor:', 'intitle:']
        if is_valid_isbn(searchterm):
            api_strings = ['isbn:']

        api_hits = 0
        ignored = 0
        total_count = 0
        no_author_count = 0
        title = ''
        authorname = ''

        if ' <ll> ' in searchterm:  # special token separates title from author
            title, authorname = searchterm.split(' <ll> ')

        fullterm = searchterm.replace(' <ll> ', ' ')
        logger.debug('Now searching Google Books API with searchterm: %s' % fullterm)

        for api_value in api_strings:
            set_url = self.url
            if api_value == "isbn:":
                set_url = set_url + quote(api_value + searchterm)
            elif api_value == 'intitle:':
                searchterm = fullterm
                if title:  # just search for title
                    # noinspection PyUnresolvedReferences
                    title = title.split(' (')[0]  # without any series info
                    searchterm = title
                searchterm = searchterm.replace("'", "").replace('"', '').strip()  # and no quotes
                if PY2:
                    searchterm = searchterm.encode(lazylibrarian.SYS_ENCODING)
                set_url = set_url + quote(api_value + '"' + searchterm + '"')
            elif api_value == 'inauthor:':
                searchterm = fullterm
                if authorname:
                    searchterm = authorname  # just search for author
                searchterm = searchterm.strip()
                if PY2:
                    searchterm = searchterm.encode(lazylibrarian.SYS_ENCODING)
                set_url = set_url + quote_plus(api_value + '"' + searchterm + '"')

            # per-pass counters; NOTE(review): ignored/total_count/no_author_count
            # are reset here each pass but logged once after the loop, so the
            # totals reported reflect only the last api_value pass — confirm intended
            startindex = 0
            resultcount = 0
            ignored = 0
            number_results = 1
            total_count = 0
            no_author_count = 0

            try:
                # Google Books pages results 40 at a time
                while startindex < number_results:
                    self.params['startIndex'] = startindex
                    URL = set_url + '&' + urlencode(self.params)

                    try:
                        jsonresults, in_cache = gb_json_request(URL)
                        if jsonresults is None:
                            number_results = 0
                        else:
                            if not in_cache:
                                api_hits += 1
                            number_results = jsonresults['totalItems']
                            logger.debug('Searching url: ' + URL)
                        if number_results == 0:
                            logger.warn('Found no results for %s with value: %s' % (api_value, searchterm))
                            break
                        else:
                            pass
                    except Exception as err:
                        if hasattr(err, 'reason'):
                            errmsg = err.reason
                        else:
                            errmsg = str(err)
                        logger.warn(
                            'Google Books API Error [%s]: Check your API key or wait a while' % errmsg)
                        break

                    startindex += 40

                    for item in jsonresults['items']:
                        total_count += 1
                        book = bookdict(item)
                        if not book['author']:
                            logger.debug('Skipped a result without authorfield.')
                            no_author_count += 1
                            continue

                        if not book['name']:
                            logger.debug('Skipped a result without title.')
                            continue

                        valid_langs = getList(lazylibrarian.CONFIG['IMP_PREFLANG'])
                        if "All" not in valid_langs:  # don't care about languages, accept all
                            try:
                                # skip if language is not in valid list -
                                booklang = book['lang']
                                if booklang not in valid_langs:
                                    logger.debug(
                                        'Skipped %s with language %s' % (book['name'], booklang))
                                    ignored += 1
                                    continue
                            except KeyError:
                                ignored += 1
                                logger.debug('Skipped %s where no language is found' % book['name'])
                                continue

                        if authorname:
                            author_fuzz = fuzz.ratio(book['author'], authorname)
                        else:
                            author_fuzz = fuzz.ratio(book['author'], fullterm)

                        if title:
                            book_fuzz = fuzz.token_set_ratio(book['name'], title)
                            # lose a point for each extra word in the fuzzy matches so we get the closest match
                            words = len(getList(book['name']))
                            words -= len(getList(title))
                            book_fuzz -= abs(words)
                        else:
                            book_fuzz = fuzz.token_set_ratio(book['name'], fullterm)

                        isbn_fuzz = 0
                        if is_valid_isbn(fullterm):
                            isbn_fuzz = 100

                        highest_fuzz = max((author_fuzz + book_fuzz) / 2, isbn_fuzz)

                        dic = {':': '.', '"': '', '\'': ''}
                        bookname = replace_all(book['name'], dic)
                        bookname = unaccented(bookname)
                        bookname = bookname.strip()  # strip whitespace

                        AuthorID = ''
                        if book['author']:
                            # NOTE(review): query is parameterised yet still
                            # doubles embedded quotes — looks redundant, confirm
                            match = myDB.match(
                                'SELECT AuthorID FROM authors WHERE AuthorName=?', (
                                    book['author'].replace('"', '""'),))
                            if match:
                                AuthorID = match['AuthorID']

                        resultlist.append({
                            'authorname': book['author'],
                            'authorid': AuthorID,
                            'bookid': item['id'],
                            'bookname': bookname,
                            'booksub': book['sub'],
                            'bookisbn': book['isbn'],
                            'bookpub': book['pub'],
                            'bookdate': book['date'],
                            'booklang': book['lang'],
                            'booklink': book['link'],
                            'bookrate': float(book['rate']),
                            'bookrate_count': book['rate_count'],
                            'bookimg': book['img'],
                            'bookpages': book['pages'],
                            'bookgenre': book['genre'],
                            'bookdesc': book['desc'],
                            'author_fuzz': author_fuzz,
                            'book_fuzz': book_fuzz,
                            'isbn_fuzz': isbn_fuzz,
                            'highest_fuzz': highest_fuzz,
                            'num_reviews': book['ratings']
                        })
                        resultcount += 1

            except KeyError:
                break

            logger.debug("Returning %s result%s for (%s) with keyword: %s" %
                         (resultcount, plural(resultcount), api_value, searchterm))

        logger.debug("Found %s result%s" % (total_count, plural(total_count)))
        logger.debug("Removed %s unwanted language result%s" % (ignored, plural(ignored)))
        logger.debug("Removed %s book%s with no author" % (no_author_count, plural(no_author_count)))
        logger.debug('The Google Books API was hit %s time%s for searchterm: %s' %
                     (api_hits, plural(api_hits), fullterm))
        queue.put(resultlist)

    except Exception:
        logger.error('Unhandled exception in GB.find_results: %s' % traceback.format_exc())
def find_results(self, authorname=None, queue=None):
    # Legacy Python 2 GoogleBooks search (note `except HTTPError, err` syntax,
    # urllib/urllib2 usage). Searches the Google Books API for `authorname`
    # (which may actually be a title or an ISBN) and puts the result list on
    # `queue`.
    threading.currentThread().name = "GB-SEARCH"
    resultlist = []
    # See if we should check ISBN field, otherwise ignore it
    # NOTE(review): ISBN detection drops the last char and checks for a 9- or
    # 12-digit int, i.e. an ISBN-10/13 with a possibly non-numeric check digit
    # — confirm this heuristic is intended
    try:
        isbn_check = int(authorname[:-1])
        if (len(str(isbn_check)) == 9) or (len(str(isbn_check)) == 12):
            api_strings = ["isbn:"]
        else:
            api_strings = ["inauthor:", "intitle:"]
    except:
        api_strings = ["inauthor:", "intitle:"]

    api_hits = 0
    logger.info("Now searching Google Books API with keyword: " + self.name)

    for api_value in api_strings:
        startindex = 0
        if api_value == "isbn:":
            set_url = self.url + urllib.quote(api_value + self.name)
        else:
            set_url = self.url + urllib.quote(api_value + '"' + self.name + '"')

        try:
            startindex = 0
            resultcount = 0
            removedResults = 0  # NOTE(review): never incremented or reported here
            ignored = 0
            total_count = 0
            no_author_count = 0

            # page through results 40 at a time until startindex passes totalItems
            while True:
                self.params["startIndex"] = startindex
                URL = set_url + "&" + urllib.urlencode(self.params)

                try:
                    jsonresults = json.JSONDecoder().decode(urllib2.urlopen(URL, timeout=30).read())
                    api_hits = api_hits + 1
                    number_results = jsonresults["totalItems"]
                    logger.debug("Searching url: " + URL)
                    if number_results == 0:
                        logger.info("Found no results for %s with value: %s" % (api_value, self.name))
                        break
                    else:
                        pass
                except HTTPError, err:
                    logger.warn("Google Books API Error [%s]: Check your API key or wait a while" % err.msg)
                    break

                startindex = startindex + 40

                for item in jsonresults["items"]:
                    total_count = total_count + 1

                    # skip if no author, no author is no book.
                    try:
                        Author = item["volumeInfo"]["authors"][0]
                    except KeyError:
                        logger.debug("Skipped a result without authorfield.")
                        no_author_count = no_author_count + 1
                        continue

                    try:
                        # skip if language is in ignore list
                        booklang = item["volumeInfo"]["language"]
                        valid_langs = [valid_lang.strip()
                                       for valid_lang in lazylibrarian.IMP_PREFLANG.split(",")]
                        if booklang not in valid_langs:
                            logger.debug("Skipped a book with language %s" % booklang)
                            ignored = ignored + 1
                            continue
                    except KeyError:
                        ignored = ignored + 1
                        logger.debug("Skipped a result where no language is found")
                        continue

                    # optional volumeInfo fields, each with its own fallback
                    try:
                        bookpub = item["volumeInfo"]["publisher"]
                    except KeyError:
                        bookpub = None

                    try:
                        booksub = item["volumeInfo"]["subtitle"]
                    except KeyError:
                        booksub = None

                    try:
                        bookdate = item["volumeInfo"]["publishedDate"]
                    except KeyError:
                        bookdate = "0000-00-00"
                    bookdate = bookdate[:4]  # keep year only

                    try:
                        bookimg = item["volumeInfo"]["imageLinks"]["thumbnail"]
                    except KeyError:
                        bookimg = "images/nocover.png"

                    try:
                        bookrate = item["volumeInfo"]["averageRating"]
                    except KeyError:
                        bookrate = 0

                    try:
                        bookpages = item["volumeInfo"]["pageCount"]
                    except KeyError:
                        bookpages = "0"

                    try:
                        bookgenre = item["volumeInfo"]["categories"][0]
                    except KeyError:
                        bookgenre = None

                    try:
                        bookdesc = item["volumeInfo"]["description"]
                    except KeyError:
                        bookdesc = "Not available"

                    try:
                        num_reviews = item["volumeInfo"]["ratingsCount"]
                    except KeyError:
                        num_reviews = 0

                    try:
                        if item["volumeInfo"]["industryIdentifiers"][0]["type"] == "ISBN_10":
                            bookisbn = item["volumeInfo"]["industryIdentifiers"][0]["identifier"]
                        else:
                            bookisbn = 0
                    except KeyError:
                        bookisbn = 0

                    # score against the single supplied keyword for both fields
                    author_fuzz = fuzz.ratio(Author.lower(), authorname.lower())
                    book_fuzz = fuzz.ratio(item["volumeInfo"]["title"].lower(), authorname.lower())
                    try:
                        isbn_check = int(authorname[:-1])
                        if (len(str(isbn_check)) == 9) or (len(str(isbn_check)) == 12):
                            isbn_fuzz = int(100)
                        else:
                            isbn_fuzz = int(0)
                    except:
                        isbn_fuzz = int(0)
                    highest_fuzz = max(author_fuzz, book_fuzz, isbn_fuzz)

                    resultlist.append(
                        {
                            "authorname": Author,
                            "bookid": item["id"],
                            "bookname": item["volumeInfo"]["title"],
                            "booksub": booksub,
                            "bookisbn": bookisbn,
                            "bookpub": bookpub,
                            "bookdate": bookdate,
                            "booklang": booklang,
                            "booklink": item["volumeInfo"]["canonicalVolumeLink"],
                            "bookrate": float(bookrate),
                            "bookimg": bookimg,
                            "bookpages": bookpages,
                            "bookgenre": bookgenre,
                            "bookdesc": bookdesc,
                            "author_fuzz": author_fuzz,
                            "book_fuzz": book_fuzz,
                            "isbn_fuzz": isbn_fuzz,
                            "highest_fuzz": highest_fuzz,
                            "num_reviews": num_reviews,
                        }
                    )
                    resultcount = resultcount + 1

                if startindex >= number_results:
                    logger.debug("Found %s total results" % total_count)
                    logger.debug("Removed %s bad language results" % ignored)
                    logger.debug("Removed %s books with no author" % no_author_count)
                    logger.info(
                        "Showing %s results for (%s) with keyword: %s" % (resultcount, api_value, authorname)
                    )
                    break
                else:
                    continue

        except KeyError:
            break

    logger.info("The Google Books API was hit %s times for keyword %s" % (str(api_hits), self.name))
    queue.put(resultlist)
def find_results(self, searchterm=None, queue=None):
    """Search the GoodReads API for books matching `searchterm`.

    Lazylibrarian searches use the ' <ll> ' token to separate title from
    author in searchterm; if absent, the term is an ISBN or free text as
    supplied by the user.  Results (list of dicts with fuzz scores) are
    delivered via queue.put().
    """
    try:
        resultlist = []
        api_hits = 0
        searchtitle = ''
        searchauthorname = ''

        if ' <ll> ' in searchterm:  # special token separates title from author
            searchtitle, searchauthorname = searchterm.split(' <ll> ')

        searchterm = searchterm.replace(' <ll> ', ' ')
        searchterm = searchterm.encode(lazylibrarian.SYS_ENCODING)
        url = urllib.quote_plus(searchterm)
        set_url = 'https://www.goodreads.com/search.xml?q=' + url + '&' + urllib.urlencode(self.params)
        logger.debug('Now searching GoodReads API with searchterm: %s' % searchterm)
        # logger.debug('Searching for %s at: %s' % (searchterm, set_url))

        resultcount = 0
        try:
            try:
                rootxml, in_cache = get_xml_request(set_url)
            except Exception as e:
                logger.error("%s finding gr results: %s" % (type(e).__name__, str(e)))
                return
            if rootxml is None:
                logger.debug("Error requesting results")
                return

            totalresults = check_int(rootxml.find('search/total-results').text, 0)

            resultxml = rootxml.getiterator('work')
            loopCount = 1
            while resultxml:
                for author in resultxml:
                    try:
                        if author.find('original_publication_year').text is None:
                            bookdate = "0000"
                        else:
                            bookdate = author.find('original_publication_year').text
                    except (KeyError, AttributeError):
                        bookdate = "0000"

                    try:
                        authorNameResult = author.find('./best_book/author/name').text
                        # Goodreads sometimes puts extra whitepase in the author names!
                        authorNameResult = ' '.join(authorNameResult.split())
                    except (KeyError, AttributeError):
                        authorNameResult = ""

                    booksub = ""
                    bookpub = ""
                    booklang = "Unknown"

                    try:
                        bookimg = author.find('./best_book/image_url').text
                        if bookimg == 'https://www.goodreads.com/assets/nocover/111x148.png':
                            bookimg = 'images/nocover.png'
                    except (KeyError, AttributeError):
                        bookimg = 'images/nocover.png'

                    try:
                        bookrate = author.find('average_rating').text
                    except KeyError:
                        bookrate = 0

                    bookpages = '0'
                    bookgenre = ''
                    bookdesc = ''
                    bookisbn = ''

                    try:
                        booklink = 'https://www.goodreads.com/book/show/' + author.find('./best_book/id').text
                    except (KeyError, AttributeError):
                        booklink = ""

                    try:
                        authorid = author.find('./best_book/author/id').text
                    except (KeyError, AttributeError):
                        authorid = ""

                    try:
                        if author.find('./best_book/title').text is None:
                            bookTitle = ""
                        else:
                            bookTitle = author.find('./best_book/title').text
                    except (KeyError, AttributeError):
                        bookTitle = ""

                    if searchauthorname:
                        author_fuzz = fuzz.ratio(authorNameResult, searchauthorname)
                    else:
                        author_fuzz = fuzz.ratio(authorNameResult, searchterm)

                    if searchtitle:
                        book_fuzz = fuzz.token_set_ratio(bookTitle, searchtitle)
                        # lose a point for each extra word in the fuzzy matches so we get the closest match
                        words = len(getList(bookTitle))
                        words -= len(getList(searchtitle))
                        book_fuzz -= abs(words)
                    else:
                        book_fuzz = fuzz.token_set_ratio(bookTitle, searchterm)
                        words = len(getList(bookTitle))
                        words -= len(getList(searchterm))
                        book_fuzz -= abs(words)

                    isbn_fuzz = 0
                    if is_valid_isbn(searchterm):
                        isbn_fuzz = 100

                    highest_fuzz = max((author_fuzz + book_fuzz) / 2, isbn_fuzz)

                    try:
                        bookid = author.find('./best_book/id').text
                    except (KeyError, AttributeError):
                        bookid = ""

                    resultlist.append({
                        'authorname': authorNameResult,
                        'bookid': bookid,
                        'authorid': authorid,
                        'bookname': bookTitle,
                        'booksub': booksub,
                        'bookisbn': bookisbn,
                        'bookpub': bookpub,
                        'bookdate': bookdate,
                        'booklang': booklang,
                        'booklink': booklink,
                        'bookrate': float(bookrate),
                        'bookimg': bookimg,
                        'bookpages': bookpages,
                        'bookgenre': bookgenre,
                        'bookdesc': bookdesc,
                        'author_fuzz': author_fuzz,
                        'book_fuzz': book_fuzz,
                        'isbn_fuzz': isbn_fuzz,
                        'highest_fuzz': highest_fuzz,
                        'num_reviews': float(bookrate)
                    })
                    resultcount += 1

                loopCount += 1

                if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < loopCount:
                    resultxml = None
                    logger.warn('Maximum results page search reached, still more results available')
                elif totalresults and resultcount >= totalresults:
                    # fix for goodreads bug on isbn searches
                    resultxml = None
                else:
                    URL = set_url + '&page=' + str(loopCount)
                    resultxml = None
                    try:
                        rootxml, in_cache = get_xml_request(URL)
                        if rootxml is None:
                            logger.debug('Error requesting page %s of results' % loopCount)
                        else:
                            resultxml = rootxml.getiterator('work')
                            if not in_cache:
                                api_hits += 1
                    except Exception as e:
                        resultxml = None
                        logger.error("%s finding page %s of results: %s" %
                                     (type(e).__name__, loopCount, str(e)))

                if resultxml:
                    if all(False for _ in resultxml):  # returns True if iterator is empty
                        resultxml = None

        except Exception as err:
            # BUGFIX: the old handler read err.code unconditionally, raising
            # AttributeError for any non-HTTP exception (which then skipped
            # queue.put entirely), and the if/if-else chain logged 404s twice.
            # Use getattr and elif so each error logs exactly one message.
            code = getattr(err, 'code', None)
            if code == 404:
                logger.error('Received a 404 error when searching for author')
            elif code == 403:
                logger.warn('Access to api is denied: usage exceeded')
            else:
                logger.error('An unexpected error has occurred when searching for an author: %s' % str(err))

        logger.debug('Found %s result%s with keyword: %s' % (resultcount, plural(resultcount), searchterm))
        logger.debug(
            'The GoodReads API was hit %s time%s for keyword %s' % (api_hits, plural(api_hits), searchterm))

        queue.put(resultlist)

    except Exception:
        logger.error('Unhandled exception in GR.find_results: %s' % traceback.format_exc())
def find_results(self, authorname=None, queue=None):
    # Legacy Python 2 GoodReads search (urllib2 + SimpleCache, `except X, e`
    # syntax). Parses the search.xml feed and appends one dict per <work>.
    # NOTE(review): this block appears truncated — the outer `try:` below has
    # no matching handler before the next def, and the result list is never
    # put on `queue` here; confirm against the original source.
    threading.currentThread().name = "GR-SEARCH"
    resultlist = []
    api_hits = 0
    url = urllib.quote_plus(authorname.encode("utf-8"))
    set_url = "http://www.goodreads.com/search.xml?q=" + url + "&" + urllib.urlencode(self.params)
    logger.info("Now searching GoodReads API with keyword: " + authorname)
    logger.debug("Searching for %s at: %s" % (authorname, set_url))
    try:
        try:
            # Cache our request
            request = urllib2.Request(set_url)
            opener = urllib2.build_opener(
                SimpleCache.CacheHandler(".AuthorCache"), SimpleCache.ThrottlingProcessor(5)
            )
            resp = opener.open(request)
            api_hits = api_hits + 1
            sourcexml = ElementTree.parse(resp)
        except Exception, e:
            # NOTE(review): sourcexml is referenced below even if this fails
            logger.error("Error finding results: " + str(e))

        rootxml = sourcexml.getroot()
        resultxml = rootxml.getiterator("work")
        author_dict = []
        resultcount = 0
        for author in resultxml:
            bookdate = "0001-01-01"
            if author.find("original_publication_year").text == None:
                bookdate = "0000"
            else:
                bookdate = author.find("original_publication_year").text
            authorNameResult = author.find("./best_book/author/name").text
            booksub = ""
            bookpub = ""
            booklang = "en"
            try:
                bookimg = author.find("./best_book/image_url").text
                if bookimg == "http://www.goodreads.com/assets/nocover/111x148.png":
                    bookimg = "images/nocover.png"
            except KeyError:
                bookimg = "images/nocover.png"
            except AttributeError:
                bookimg = "images/nocover.png"
            try:
                bookrate = author.find("average_rating").text
            except KeyError:
                bookrate = 0
            bookpages = "0"
            bookgenre = ""
            bookdesc = ""
            bookisbn = ""
            booklink = "http://www.goodreads.com/book/show/" + author.find("./best_book/id").text
            if author.find("./best_book/title").text == None:
                bookTitle = ""
            else:
                bookTitle = author.find("./best_book/title").text
            # score both fields against the single supplied keyword
            author_fuzz = fuzz.ratio(authorNameResult.lower(), authorname.lower())
            book_fuzz = fuzz.ratio(bookTitle.lower(), authorname.lower())
            # crude ISBN heuristic: 9/12 digits plus a check digit
            try:
                isbn_check = int(authorname[:-1])
                if (len(str(isbn_check)) == 9) or (len(str(isbn_check)) == 12):
                    isbn_fuzz = int(100)
                else:
                    isbn_fuzz = int(0)
            except:
                isbn_fuzz = int(0)
            highest_fuzz = max(author_fuzz, book_fuzz, isbn_fuzz)
            resultlist.append(
                {
                    "authorname": author.find("./best_book/author/name").text,
                    "bookid": author.find("./best_book/id").text,
                    "authorid": author.find("./best_book/author/id").text,
                    "bookname": bookTitle.encode("ascii", "ignore"),
                    "booksub": booksub,
                    "bookisbn": bookisbn,
                    "bookpub": bookpub,
                    "bookdate": bookdate,
                    "booklang": booklang,
                    "booklink": booklink,
                    "bookrate": float(bookrate),
                    "bookimg": bookimg,
                    "bookpages": bookpages,
                    "bookgenre": bookgenre,
                    "bookdesc": bookdesc,
                    "author_fuzz": author_fuzz,
                    "book_fuzz": book_fuzz,
                    "isbn_fuzz": isbn_fuzz,
                    "highest_fuzz": highest_fuzz,
                    "num_reviews": float(bookrate),
                }
            )
            resultcount = resultcount + 1
def find_results(self, authorname=None, queue=None):
    # Legacy Python 2 GoodReads search — near-duplicate of the previous
    # version, with proxy support and a custom User-Agent added to the request.
    # NOTE(review): this block appears truncated — the outer `try:` below has
    # no matching handler before the next def, and the result list is never
    # put on `queue` here; confirm against the original source.
    threading.currentThread().name = "GR-SEARCH"
    resultlist = []
    api_hits = 0
    url = urllib.quote_plus(authorname.encode('utf-8'))
    set_url = 'http://www.goodreads.com/search.xml?q=' + url + '&' + urllib.urlencode(self.params)
    logger.info('Now searching GoodReads API with keyword: ' + authorname)
    logger.debug('Searching for %s at: %s' % (authorname, set_url))
    try:
        try:
            # Cache our request
            request = urllib2.Request(set_url)
            if lazylibrarian.PROXY_HOST:
                request.set_proxy(lazylibrarian.PROXY_HOST, lazylibrarian.PROXY_TYPE)
            request.add_header('User-Agent', USER_AGENT)
            opener = urllib2.build_opener(SimpleCache.CacheHandler(".AuthorCache"),
                                          SimpleCache.ThrottlingProcessor(5))
            resp = opener.open(request)
            api_hits = api_hits + 1
            sourcexml = ElementTree.parse(resp)
        except Exception, e:
            # NOTE(review): sourcexml is referenced below even if this fails
            logger.error("Error finding results: " + str(e))

        rootxml = sourcexml.getroot()
        resultxml = rootxml.getiterator('work')
        author_dict = []
        resultcount = 0
        for author in resultxml:
            bookdate = "0001-01-01"
            if (author.find('original_publication_year').text == None):
                bookdate = "0000"
            else:
                bookdate = author.find('original_publication_year').text
            authorNameResult = author.find('./best_book/author/name').text
            booksub = ""
            bookpub = ""
            booklang = "en"
            try:
                bookimg = author.find('./best_book/image_url').text
                if (bookimg == 'http://www.goodreads.com/assets/nocover/111x148.png'):
                    bookimg = 'images/nocover.png'
            except KeyError:
                bookimg = 'images/nocover.png'
            except AttributeError:
                bookimg = 'images/nocover.png'
            try:
                bookrate = author.find('average_rating').text
            except KeyError:
                bookrate = 0
            bookpages = '0'
            bookgenre = ''
            bookdesc = ''
            bookisbn = ''
            booklink = 'http://www.goodreads.com/book/show/'+author.find('./best_book/id').text
            if (author.find('./best_book/title').text == None):
                bookTitle = ""
            else:
                bookTitle = author.find('./best_book/title').text
            # score both fields against the single supplied keyword
            author_fuzz = fuzz.ratio(authorNameResult.lower(), authorname.lower())
            book_fuzz = fuzz.ratio(bookTitle.lower(), authorname.lower())
            # crude ISBN heuristic: 9/12 digits plus a check digit
            try:
                isbn_check = int(authorname[:-1])
                if (len(str(isbn_check)) == 9) or (len(str(isbn_check)) == 12):
                    isbn_fuzz = int(100)
                else:
                    isbn_fuzz = int(0)
            except:
                isbn_fuzz = int(0)
            highest_fuzz = max(author_fuzz, book_fuzz, isbn_fuzz)
            resultlist.append({
                'authorname': author.find('./best_book/author/name').text,
                'bookid': author.find('./best_book/id').text,
                'authorid' : author.find('./best_book/author/id').text,
                'bookname': bookTitle.encode("ascii", "ignore"),
                'booksub': booksub,
                'bookisbn': bookisbn,
                'bookpub': bookpub,
                'bookdate': bookdate,
                'booklang': booklang,
                'booklink': booklink,
                'bookrate': float(bookrate),
                'bookimg': bookimg,
                'bookpages': bookpages,
                'bookgenre': bookgenre,
                'bookdesc': bookdesc,
                'author_fuzz': author_fuzz,
                'book_fuzz': book_fuzz,
                'isbn_fuzz': isbn_fuzz,
                'highest_fuzz': highest_fuzz,
                'num_reviews': float(bookrate)
            })
            resultcount = resultcount+1
def search_magazines(mags=None):
    # produce a list of magazines to search for, tor, nzb, torznab
    #
    # mags=None means a backlog search over all Active magazines; otherwise
    # search only the magazines named in the supplied list of dicts.
    # For each provider result the title is fuzzily matched against the
    # magazine name, then several date layouts (regexA/B/C) are tried to
    # extract an issue date; new issues are queued for download.
    myDB = database.DBConnection()
    searchlist = []
    threading.currentThread().name = "SEARCHMAGS"

    if mags is None:  # backlog search
        searchmags = myDB.select('SELECT Title, Frequency, LastAcquired, \
 IssueDate from magazines WHERE Status="Active"')
    else:
        searchmags = []
        for magazine in mags:
            searchmags_temp = myDB.select('SELECT Title, Frequency, LastAcquired, IssueDate from magazines \
 WHERE Title="%s" AND Status="Active"' % (magazine['bookid']))
            for terms in searchmags_temp:
                searchmags.append(terms)

    if len(searchmags) == 1:
        logger.info('Searching for one magazine')
    else:
        logger.info('Searching for %i magazines' % len(searchmags))

    for searchmag in searchmags:
        bookid = searchmag[0]
        searchterm = searchmag[0]
        frequency = searchmag[1]
        # last_acquired = searchmag[2]
        # issue_date = searchmag[3]

        # strip characters that confuse the indexers
        dic = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': ''}

        searchterm = formatter.latinToAscii(formatter.replace_all(searchterm, dic))
        searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8')
        searchlist.append({"bookid": bookid, "searchterm": searchterm})

    if searchlist == []:
        logger.warn('There is nothing to search for.  Mark some magazines as active.')

    for book in searchlist:
        resultlist = []
        tor_resultlist = []
        if lazylibrarian.USE_NZB:
            resultlist, nproviders = providers.IterateOverNewzNabSites(book, 'mag')
            if not nproviders:
                logger.warn('No nzb providers are set. Check config for NEWZNAB or TORZNAB providers')

        if lazylibrarian.USE_TOR:
            tor_resultlist, nproviders = providers.IterateOverTorrentSites(book, 'mag')
            if not nproviders:
                logger.warn('No torrent providers are set. Check config for TORRENT providers')

            for item in tor_resultlist:  # reformat the torrent results so they look like nzbs
                resultlist.append({
                    'bookid': item['bookid'],
                    'nzbprov': item['tor_prov'],
                    'nzbtitle': item['tor_title'],
                    'nzburl': item['tor_url'],
                    'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100',  # fake date as none returned from torrents
                    'nzbsize': item['tor_size'],
                    'nzbmode': 'torrent'
                })

        if not resultlist:
            logger.debug("Adding magazine %s to queue." % book['searchterm'])
        else:
            bad_regex = 0
            bad_date = 0
            old_date = 0
            total_nzbs = 0
            new_date = 0
            to_snatch = 0
            maglist = []
            issues = []
            for nzb in resultlist:
                total_nzbs = total_nzbs + 1
                bookid = nzb['bookid']
                nzbtitle = (u'%s' % nzb['nzbtitle'])
                nzbtitle = nzbtitle.replace('"', '').replace("'", "")  # suppress " in titles
                nzburl = nzb['nzburl']
                nzbprov = nzb['nzbprov']
                nzbdate_temp = nzb['nzbdate']
                nzbsize_temp = nzb['nzbsize']
                if nzbsize_temp is None:  # not all torrents returned by torznab have a size
                    nzbsize_temp = 1000
                nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + ' MB'
                nzbdate = formatter.nzbdate2format(nzbdate_temp)
                nzbmode = nzb['nzbmode']

                checkifmag = myDB.select('SELECT * from magazines WHERE Title="%s"' % bookid)
                if checkifmag:
                    for results in checkifmag:
                        control_date = results['IssueDate']
                        frequency = results['Frequency']
                        # regex = results['Regex']

                    # normalise punctuation to spaces before word matching
                    nzbtitle_formatted = nzbtitle.replace('.', ' ').replace('-', ' ').replace('/', ' ').replace(
                        '+', ' ').replace('_', ' ').replace('(', '').replace(')', '').strip()
                    # Need to make sure that substrings of magazine titles don't get found
                    # (e.g. Maxim USA will find Maximum PC USA)
                    # keyword_check = nzbtitle_formatted.replace(bookid, '')
                    # remove extra spaces if they're in a row
                    nzbtitle_exploded_temp = " ".join(nzbtitle_formatted.split())
                    nzbtitle_exploded = nzbtitle_exploded_temp.split(' ')

                    if ' ' in bookid:
                        bookid_exploded = bookid.split(' ')
                    else:
                        bookid_exploded = [bookid]

                    # check nzb starts with magazine title, and ends with a date
                    # eg The MagPI Issue 22 - July 2015
                    # do something like check left n words match title
                    # then check last n words are a date
                    name_match = 1  # assume name matches for now
                    name_len = len(bookid_exploded)
                    if len(nzbtitle_exploded) > name_len:  # needs to be longer as it should include a date
                        while name_len:
                            name_len = name_len - 1
                            # fuzzy check on each word in the magazine name with any accents stripped
                            # fuzz.ratio doesn't lowercase for us
                            ratio = fuzz.ratio(common.remove_accents(nzbtitle_exploded[name_len].lower()),
                                               common.remove_accents(bookid_exploded[name_len].lower()))
                            if ratio < 80:  # hard coded fuzz ratio for now, works for close matches
                                logger.debug("Magazine fuzz ratio failed [%d] [%s] [%s]" % (
                                    ratio, bookid, nzbtitle_formatted))
                                name_match = 0  # name match failed

                    if name_match:
                        # some magazine torrent uploaders add their sig in [] or {}
                        # Fortunately for us, they always seem to add it at the end
                        # some magazine torrent titles are "magazine_name some_form_of_date pdf"
                        # so strip all the trailing junk...
                        while nzbtitle_exploded[len(nzbtitle_exploded) - 1][0] in '[{' or \
                                nzbtitle_exploded[len(nzbtitle_exploded) - 1].lower() == 'pdf':
                            nzbtitle_exploded.pop()  # gotta love the function names

                        if len(nzbtitle_exploded) > 1:
                            # regexA = DD MonthName YYYY OR MonthName YYYY or nn MonthName YYYY
                            regexA_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1]
                            if regexA_year.isdigit():
                                if int(regexA_year) < 1900 or int(regexA_year) > 2100:
                                    regexA_year = 'Invalid'

                            regexA_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 2]
                            regexA_month = formatter.month2num(common.remove_accents(regexA_month_temp))

                            if frequency == "Weekly" or frequency == "BiWeekly":
                                regexA_day = nzbtitle_exploded[len(nzbtitle_exploded) - 3].zfill(2)
                                if regexA_day.isdigit():
                                    if int(regexA_day) > 31:  # probably issue number nn
                                        regexA_day = '01'
                                else:
                                    regexA_day = '01'  # just MonthName YYYY
                            else:
                                regexA_day = '01'  # monthly, or less frequent

                            # int() raises if any piece was non-numeric, which
                            # chains us on to the next date layout to try
                            newdatish_regexA = regexA_year + regexA_month + regexA_day
                            try:
                                int(newdatish_regexA)
                                newdatish = regexA_year + '-' + regexA_month + '-' + regexA_day
                            except:
                                # regexB = MonthName DD YYYY
                                regexB_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1]
                                regexB_day = nzbtitle_exploded[len(nzbtitle_exploded) - 2].zfill(2)
                                regexB_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 3]
                                regexB_month = formatter.month2num(common.remove_accents(regexB_month_temp))
                                newdatish_regexB = regexB_year + regexB_month + regexB_day
                                try:
                                    int(newdatish_regexB)
                                    newdatish = regexB_year + '-' + regexB_month + '-' + regexB_day
                                except:
                                    # regexC = YYYY MM or YYYY MM DD or Issue nn YYYY
                                    # (can't get MM/DD if named Issue nn)
                                    newdatish_regexC = 'Invalid'  # invalid unless works out otherwise
                                    regexC_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 2]
                                    if regexC_temp.isdigit():
                                        if int(regexC_temp) > 1900 and int(regexC_temp) < 2100:
                                            # YYYY MM or YYYY nn
                                            regexC_year = regexC_temp
                                            regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2)
                                            regexC_day = '01'
                                            if regexC_month.isdigit():
                                                # could be YYYY nn where nn is issue number
                                                if int(regexC_month) < 13:
                                                    # if issue number > 12 date matching will fail
                                                    newdatish_regexC = regexC_year + regexC_month + regexC_day
                                    else:
                                        regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 3]
                                        if regexC_year.isdigit():
                                            if int(regexC_year) > 1900 and int(regexC_year) < 2100:
                                                # YYYY MM DD or YYYY nn-nn
                                                regexC_month = regexC_temp.zfill(2)
                                                if int(regexC_month) < 13:
                                                    # if issue number > 12 date matching will fail
                                                    regexC_day = nzbtitle_exploded[len(
                                                        nzbtitle_exploded) - 1].zfill(2)
                                                    newdatish_regexC = regexC_year + regexC_month + regexC_day

                                    try:
                                        int(newdatish_regexC)
                                        newdatish = regexC_year + '-' + regexC_month + '-' + regexC_day
                                    except:
                                        logger.debug('Magazine %s not in proper date format.' % nzbtitle_formatted)
                                        bad_date = bad_date + 1
                                        # allow issues with good name but bad date to be included
                                        # so user can manually select them
                                        newdatish = "1970-01-01"  # provide a fake date for bad-date issues
                                        # continue
                        else:
                            continue

                        # Don't want to overwrite status = Skipped for NZBs that have been previously found
                        wanted_status = myDB.select('SELECT * from wanted WHERE NZBtitle="%s"' % nzbtitle)
                        if wanted_status:
                            for results in wanted_status:
                                status = results['Status']
                        else:
                            status = "Skipped"

                        controlValueDict = {"NZBurl": nzburl}
                        newValueDict = {
                            "NZBprov": nzbprov,
                            "BookID": bookid,
                            "NZBdate": nzbdate,
                            "NZBtitle": nzbtitle,
                            "AuxInfo": newdatish,
                            "Status": status,
                            "NZBsize": nzbsize,
                            "NZBmode": nzbmode
                        }
                        myDB.upsert("wanted", newValueDict, controlValueDict)

                        if control_date is None:  # we haven't got any copies of this magazine yet
                            # get a rough time just over a month ago to compare to, in format yyyy-mm-dd
                            # could perhaps calc differently for weekly, biweekly etc
                            start_time = time.time()
                            start_time -= 31 * 24 * 60 * 60  # number of seconds in 31 days
                            control_date = time.strftime("%Y-%m-%d", time.localtime(start_time))

                        # only grab a copy if it's newer than the most recent we have,
                        # or newer than a month ago if we have none
                        comp_date = formatter.datecompare(newdatish, control_date)
                        if comp_date > 0:
                            # Should probably only upsert when downloaded and processed in case snatch fails
                            # keep track of what we're going to download so we don't download dupes
                            new_date = new_date + 1
                            issue = bookid + ',' + newdatish
                            if issue not in issues:
                                maglist.append({
                                    'bookid': bookid,
                                    'nzbprov': nzbprov,
                                    'nzbtitle': nzbtitle,
                                    'nzburl': nzburl,
                                    'nzbmode': nzbmode
                                })
                                logger.debug('This issue of %s is new, downloading' % nzbtitle_formatted)
                                to_snatch = to_snatch + 1
                                issues.append(issue)
                            else:
                                logger.debug('This issue of %s is already flagged for download' % issue)
                        else:
                            if newdatish != "1970-01-01":  # this is our fake date for ones we can't decipher
                                logger.debug('This issue of %s is old; skipping.' % nzbtitle_formatted)
                                old_date = old_date + 1
                    else:
                        logger.debug('Magazine [%s] does not completely match search term [%s].' % (
                            nzbtitle_formatted, bookid))
                        bad_regex = bad_regex + 1

            logger.info('Found %s results for %s. %s are new, %s are old, %s fail date, %s fail name matching' % (
                total_nzbs, bookid, new_date, old_date, bad_date, bad_regex))
            logger.info("%s, %s issues to download" % (bookid, to_snatch))

            for items in maglist:
                if items['nzbmode'] == "torznab":
                    snatch = TORDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl'])
                elif items['nzbmode'] == "torrent":
                    snatch = TORDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl'])
                else:
                    snatch = NZBDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl'])
                if snatch:
                    notifiers.notify_snatch(formatter.latinToAscii(items['nzbtitle']) + ' at ' + formatter.now())
                    postprocess.schedule_processor(action='Start')
            maglist = []

    logger.info("Search for magazines complete")
def find_book_in_db(myDB, author, book):
    # PAB fuzzy search for book in library, return LL bookid if found or zero
    # if not, return bookid to more easily update status
    #
    # prefer an exact match on author & book.
    # Use a parameterised query: author/book names regularly contain quotes,
    # which broke (and made injectable) the old string-interpolated SQL.
    match = myDB.action('SELECT BookID FROM books where AuthorName=? and BookName=?',
                        [author, book]).fetchone()
    if match:
        logger.debug('Exact match [%s]' % book)
        return match['BookID']
    else:
        # No exact match
        # Try a more complex fuzzy match against each book in the db by this author
        # Using hard-coded ratios for now, ratio high (>90), partial_ratio lower (>65)
        # These are results that work well on my library, minimal false matches and no misses
        # on books that should be matched
        # Maybe make ratios configurable in config.ini later
        # TODO: parameterise this select too once myDB.select is confirmed to accept params
        books = myDB.select('SELECT BookID,BookName FROM books where AuthorName="%s"' % author)
        best_ratio = 0
        best_partial = 0
        ratio_name = ""
        partial_name = ""
        ratio_id = 0
        partial_id = 0
        # tidy up the search term once, outside the loop (it is loop-invariant)
        # still need to lowercase for matching against partial_name later on
        book_lower = common.remove_accents(book.lower())
        # logger.debug("Found %s books for %s" % (len(books), author))
        for a_book in books:
            a_book_lower = common.remove_accents(a_book['BookName'].lower())
            #
            ratio = fuzz.ratio(book_lower, a_book_lower)
            partial = fuzz.partial_ratio(book_lower, a_book_lower)
            if ratio > best_ratio:
                best_ratio = ratio
                ratio_name = a_book['BookName']
                ratio_id = a_book['BookID']
            if partial > best_partial:
                best_partial = partial
                partial_name = a_book['BookName']
                partial_id = a_book['BookID']
            elif partial == best_partial:
                # prefer the match closest to the left, ie prefer starting with a match
                # and ignoring the rest
                # this eliminates most false matches against omnibuses
                if a_book_lower.find(book_lower) < partial_name.lower().find(book_lower):
                    logger.debug(
                        "Fuzz left prefer [%s] over [%s]" % (a_book['BookName'], partial_name))
                    best_partial = partial
                    partial_name = a_book['BookName']
                    partial_id = a_book['BookID']
        #
        if best_ratio > 90:
            logger.debug(
                "Fuzz match ratio [%d] [%s] [%s]" % (best_ratio, book, ratio_name))
            return ratio_id
        if best_partial > 65:
            logger.debug(
                "Fuzz match partial [%d] [%s] [%s]" % (best_partial, book, partial_name))
            return partial_id

        logger.debug(
            'Fuzz failed [%s - %s] ratio [%d,%s], partial [%d,%s]' %
            (author, book, best_ratio, ratio_name, best_partial, partial_name))
        return 0
def LibraryScan(startdir=None):
    """Scan a directory tree adding new books into database.

    startdir defaults to the configured DESTINATION_DIR; scanning that
    directory counts as a "full" scan (stats reset, missing books flagged).
    Return how many books you added.
    """
    if not startdir:
        if not lazylibrarian.DESTINATION_DIR:
            return 0
        else:
            startdir = lazylibrarian.DESTINATION_DIR

    if not os.path.isdir(startdir):
        logger.warn(
            'Cannot find directory: %s. Not scanning' % startdir)
        return 0

    myDB = database.DBConnection()

    # keep statistics of full library scans
    if startdir == lazylibrarian.DESTINATION_DIR:
        myDB.action('DELETE from stats')

    logger.info('Scanning ebook directory: %s' % startdir)

    new_book_count = 0
    file_count = 0
    author = ""

    # On a full scan, first mark any "Open" book whose file has disappeared from disk
    if lazylibrarian.FULL_SCAN and startdir == lazylibrarian.DESTINATION_DIR:
        books = myDB.select(
            'select AuthorName, BookName, BookFile, BookID from books where Status="Open"')
        status = lazylibrarian.NOTFOUND_STATUS
        logger.info('Missing books will be marked as %s' % status)
        for book in books:
            bookName = book['BookName']
            bookAuthor = book['AuthorName']
            bookID = book['BookID']
            bookfile = book['BookFile']
            if not(bookfile and os.path.isfile(bookfile)):
                myDB.action('update books set Status="%s" where BookID="%s"' % (status, bookID))
                myDB.action('update books set BookFile="" where BookID="%s"' % bookID)
                logger.warn('Book %s - %s updated as not found on disk' % (bookAuthor, bookName))

    # to save repeat-scans of the same directory if it contains multiple formats of the same book,
    # keep track of which directories we've already looked at
    processed_subdirectories = []

    matchString = ''
    for char in lazylibrarian.EBOOK_DEST_FILE:
        matchString = matchString + '\\' + char
    # massage the EBOOK_DEST_FILE config parameter into something we can use
    # with regular expression matching
    booktypes = ''
    count = -1
    booktype_list = getList(lazylibrarian.EBOOK_TYPE)
    for book_type in booktype_list:
        count += 1
        if count == 0:
            booktypes = book_type
        else:
            booktypes = booktypes + '|' + book_type
    matchString = matchString.replace("\\$\\A\\u\\t\\h\\o\\r", "(?P<author>.*?)").replace(
        "\\$\\T\\i\\t\\l\\e", "(?P<book>.*?)") + '\.[' + booktypes + ']'
    pattern = re.compile(matchString, re.VERBOSE)

    for r, d, f in os.walk(startdir):
        for directory in d[:]:
            # prevent magazine being scanned
            if directory.startswith("_") or directory.startswith("."):
                d.remove(directory)

        for files in f:
            file_count += 1

            # Python 2: decode byte paths so DB/log strings are consistent
            if isinstance(r, str):
                r = r.decode(lazylibrarian.SYS_ENCODING)

            subdirectory = r.replace(startdir, '')
            # Added new code to skip if we've done this directory before.
            # Made this conditional with a switch in config.ini
            # in case user keeps multiple different books in the same subdirectory
            if (lazylibrarian.IMP_SINGLEBOOK) and (subdirectory in processed_subdirectories):
                logger.debug("[%s] already scanned" % subdirectory)
            else:
                # If this is a book, try to get author/title/isbn/language
                # if epub or mobi, read metadata from the book
                # If metadata.opf exists, use that allowing it to override
                # embedded metadata. User may have edited metadata.opf
                # to merge author aliases together
                # If all else fails, try pattern match for author/title
                # and look up isbn/lang from LT or GR later
                match = 0
                if is_valid_booktype(files):
                    logger.debug("[%s] Now scanning subdirectory %s" % (startdir, subdirectory))

                    language = "Unknown"
                    isbn = ""
                    book = ""
                    author = ""
                    extn = os.path.splitext(files)[1]

                    # if it's an epub or a mobi we can try to read metadata from it
                    if (extn == ".epub") or (extn == ".mobi"):
                        book_filename = os.path.join(
                            r.encode(lazylibrarian.SYS_ENCODING), files.encode(lazylibrarian.SYS_ENCODING))
                        try:
                            res = get_book_info(book_filename)
                        except:
                            res = {}
                        if 'title' in res and 'creator' in res:  # this is the minimum we need
                            match = 1
                            book = res['title']
                            author = res['creator']
                            if 'language' in res:
                                language = res['language']
                            if 'identifier' in res:
                                isbn = res['identifier']
                            if 'type' in res:
                                extn = res['type']
                            logger.debug("book meta [%s] [%s] [%s] [%s] [%s]" %
                                         (isbn, language, author, book, extn))
                        else:
                            logger.debug("Book meta incomplete in %s" % book_filename)

                    # calibre uses "metadata.opf", LL uses "bookname - authorname.opf"
                    # just look for any .opf file in the current directory since we don't know
                    # LL preferred authorname/bookname at this point.
                    # Allow metadata in file to override book contents as may be users pref
                    metafile = opf_file(r)
                    try:
                        res = get_book_info(metafile)
                    except:
                        res = {}
                    if 'title' in res and 'creator' in res:  # this is the minimum we need
                        match = 1
                        book = res['title']
                        author = res['creator']
                        if 'language' in res:
                            language = res['language']
                        if 'identifier' in res:
                            isbn = res['identifier']
                        logger.debug(
                            "file meta [%s] [%s] [%s] [%s]" % (isbn, language, author, book))
                    else:
                        logger.debug("File meta incomplete in %s" % metafile)

                    if not match:  # no author/book from metadata file, and not embedded either
                        match = pattern.match(files)
                        if match:
                            author = match.group("author")
                            book = match.group("book")
                        else:
                            logger.debug("Pattern match failed [%s]" % files)

                if match:
                    # flag that we found a book in this subdirectory
                    processed_subdirectories.append(subdirectory)

                    # If we have a valid looking isbn, and language != "Unknown", add it to cache
                    if language != "Unknown" and is_valid_isbn(isbn):
                        logger.debug(
                            "Found Language [%s] ISBN [%s]" % (language, isbn))
                        # we need to add it to language cache if not already
                        # there, is_valid_isbn has checked length is 10 or 13
                        if len(isbn) == 10:
                            isbnhead = isbn[0:3]
                        else:
                            isbnhead = isbn[3:6]
                        match = myDB.action(
                            'SELECT lang FROM languages where isbn = "%s"' % (isbnhead)).fetchone()
                        if not match:
                            myDB.action(
                                'insert into languages values ("%s", "%s")' % (isbnhead, language))
                            logger.debug(
                                "Cached Lang [%s] ISBN [%s]" % (language, isbnhead))
                        else:
                            logger.debug(
                                "Already cached Lang [%s] ISBN [%s]" % (language, isbnhead))

                    # get authors name in a consistent format
                    if "," in author:  # "surname, forename"
                        words = author.split(',')
                        author = words[1].strip() + ' ' + words[0].strip()  # "forename surname"
                    if author[1] == ' ':
                        # single-letter initials separated by spaces: normalise to dotted form
                        author = author.replace(' ', '.')
                        author = author.replace('..', '.')

                    # Check if the author exists, and import the author if not,
                    # before starting any complicated book-name matching to save repeating the search
                    #
                    check_exist_author = myDB.action(
                        'SELECT * FROM authors where AuthorName="%s"' % author).fetchone()
                    if not check_exist_author and lazylibrarian.ADD_AUTHOR:
                        # no match for supplied author, but we're allowed to
                        # add new ones
                        GR = GoodReads(author)
                        try:
                            author_gr = GR.find_author_id()
                        except:
                            logger.warn(
                                "Error finding author id for [%s]" % author)
                            continue

                        # only try to add if GR data matches found author data
                        if author_gr:
                            authorname = author_gr['authorname']
                            # "J.R.R. Tolkien" is the same person as "J. R. R. Tolkien" and "J R R Tolkien"
                            match_auth = author.replace('.', '_')
                            match_auth = match_auth.replace(' ', '_')
                            match_auth = match_auth.replace('__', '_')
                            match_name = authorname.replace('.', '_')
                            match_name = match_name.replace(' ', '_')
                            match_name = match_name.replace('__', '_')
                            match_name = unaccented(match_name)
                            match_auth = unaccented(match_auth)
                            # allow a degree of fuzziness to cater for different accented character handling.
                            # some author names have accents,
                            # filename may have the accented or un-accented version of the character
                            # The currently non-configurable value of fuzziness might need to go in config
                            # We stored GoodReads unmodified author name in
                            # author_gr, so store in LL db under that
                            # fuzz.ratio doesn't lowercase for us
                            match_fuzz = fuzz.ratio(match_auth.lower(), match_name.lower())
                            if match_fuzz < 90:
                                logger.debug(
                                    "Failed to match author [%s] fuzz [%d]" % (author, match_fuzz))
                                logger.debug(
                                    "Failed to match author [%s] to authorname [%s]" % (match_auth, match_name))

                            # To save loading hundreds of books by unknown
                            # authors at GR or GB, ignore if author "Unknown"
                            if (author != "Unknown") and (match_fuzz >= 90):
                                # use "intact" name for author that we stored in
                                # GR author_dict, not one of the various mangled versions
                                # otherwise the books appear to be by a different author!
                                author = author_gr['authorname']
                                # this new authorname may already be in the
                                # database, so check again
                                check_exist_author = myDB.action(
                                    'SELECT * FROM authors where AuthorName="%s"' % author).fetchone()
                                if not check_exist_author:
                                    logger.info(
                                        "Adding new author [%s]" % author)
                                    try:
                                        addAuthorToDB(author)
                                        check_exist_author = myDB.action(
                                            'SELECT * FROM authors where AuthorName="%s"' % author).fetchone()
                                    except:
                                        continue

                    # check author exists in db, either newly loaded or already there
                    if not check_exist_author:
                        logger.debug(
                            "Failed to match author [%s] in database" % author)
                    else:
                        # author exists, check if this book by this author is in our database
                        # metadata might have quotes in book name
                        book = book.replace('"', '').replace("'", "")
                        bookid = find_book_in_db(myDB, author, book)
                        if bookid:
                            # check if book is already marked as "Open" (if so,
                            # we already had it)
                            check_status = myDB.action(
                                'SELECT Status from books where BookID="%s"' % bookid).fetchone()
                            if check_status['Status'] != 'Open':
                                # update status as we've got this book
                                myDB.action(
                                    'UPDATE books set Status="Open" where BookID="%s"' % bookid)
                                book_filename = os.path.join(r, files)
                                # update book location so we can check if it
                                # gets removed, or allow click-to-open
                                myDB.action(
                                    'UPDATE books set BookFile="%s" where BookID="%s"' % (book_filename, bookid))
                                # update cover file to cover.jpg in book folder (if exists)
                                bookdir = book_filename.rsplit(os.sep, 1)[0]
                                coverimg = os.path.join(bookdir, 'cover.jpg')
                                cachedir = os.path.join(str(lazylibrarian.PROG_DIR),
                                                        'data' + os.sep + 'images' + os.sep + 'cache')
                                cacheimg = os.path.join(cachedir, bookid + '.jpg')
                                if os.path.isfile(coverimg):
                                    copyfile(coverimg, cacheimg)
                                new_book_count += 1
                        else:
                            logger.debug(
                                "Failed to match book [%s] by [%s] in database" % (book, author))

    logger.info("%s new/modified book%s found and added to the database" %
                (new_book_count, plural(new_book_count)))
    logger.info("%s file%s processed" % (file_count, plural(file_count)))

    # show statistics of full library scans
    if startdir == lazylibrarian.DESTINATION_DIR:
        stats = myDB.action(
            "SELECT sum(GR_book_hits), sum(GR_lang_hits), sum(LT_lang_hits), sum(GB_lang_change), \
            sum(cache_hits), sum(bad_lang), sum(bad_char), sum(uncached), sum(duplicates) FROM stats").fetchone()
        if stats['sum(GR_book_hits)'] is not None:
            # only show stats if new books added
            if lazylibrarian.BOOK_API == "GoogleBooks":
                logger.debug("GoogleBooks was hit %s time%s for books" %
                             (stats['sum(GR_book_hits)'], plural(stats['sum(GR_book_hits)'])))
                logger.debug("GoogleBooks language was changed %s time%s" %
                             (stats['sum(GB_lang_change)'], plural(stats['sum(GB_lang_change)'])))
            if lazylibrarian.BOOK_API == "GoodReads":
                logger.debug("GoodReads was hit %s time%s for books" %
                             (stats['sum(GR_book_hits)'], plural(stats['sum(GR_book_hits)'])))
                logger.debug("GoodReads was hit %s time%s for languages" %
                             (stats['sum(GR_lang_hits)'], plural(stats['sum(GR_lang_hits)'])))
                logger.debug("LibraryThing was hit %s time%s for languages" %
                             (stats['sum(LT_lang_hits)'], plural(stats['sum(LT_lang_hits)'])))
            logger.debug("Language cache was hit %s time%s" %
                         (stats['sum(cache_hits)'], plural(stats['sum(cache_hits)'])))
            logger.debug("Unwanted language removed %s book%s" %
                         (stats['sum(bad_lang)'], plural(stats['sum(bad_lang)'])))
            logger.debug("Unwanted characters removed %s book%s" %
                         (stats['sum(bad_char)'], plural(stats['sum(bad_char)'])))
            logger.debug("Unable to cache %s book%s with missing ISBN" %
                         (stats['sum(uncached)'], plural(stats['sum(uncached)'])))
            logger.debug("Found %s duplicate book%s" %
                         (stats['sum(duplicates)'], plural(stats['sum(duplicates)'])))
            logger.debug("Cache %s hit%s, %s miss" %
                         (lazylibrarian.CACHE_HIT, plural(lazylibrarian.CACHE_HIT), lazylibrarian.CACHE_MISS))
            cachesize = myDB.action("select count('ISBN') as counter from languages").fetchone()
            logger.debug("ISBN Language cache holds %s entries" % cachesize['counter'])
            nolang = len(myDB.select('select BookID from Books where status="Open" and BookLang="Unknown"'))
            if nolang:
                logger.warn("Found %s book%s in your library with unknown language" %
                            (nolang, plural(nolang)))

        authors = myDB.select('select AuthorID from authors')
        # Update bookcounts for all authors, not just new ones - refresh may have located
        # new books for existing authors especially if switched provider gb/gr
    else:
        # single author/book import
        authors = myDB.select('select AuthorID from authors where AuthorName = "%s"' % author)

    logger.debug('Updating bookcounts for %i author%s' % (len(authors), plural(len(authors))))
    for author in authors:
        update_totals(author['AuthorID'])

    # cache any book covers still pointing at remote urls
    images = myDB.select('select bookid, bookimg, bookname from books where bookimg like "http%"')
    if len(images):
        logger.info("Caching cover%s for %i book%s" %
                    (plural(len(images)), len(images), plural(len(images))))
        for item in images:
            bookid = item['bookid']
            bookimg = item['bookimg']
            bookname = item['bookname']
            newimg = cache_cover(bookid, bookimg)
            if newimg is not None:
                myDB.action('update books set BookImg="%s" where BookID="%s"' % (newimg, bookid))

    # cache any author images still pointing at remote urls
    images = myDB.select('select AuthorID, AuthorImg, AuthorName from authors where AuthorImg like "http%"')
    if len(images):
        logger.info("Caching image%s for %i author%s" %
                    (plural(len(images)), len(images), plural(len(images))))
        for item in images:
            authorid = item['authorid']
            authorimg = item['authorimg']
            authorname = item['authorname']
            newimg = cache_cover(authorid, authorimg)
            if newimg is not None:
                myDB.action('update authors set AuthorImg="%s" where AuthorID="%s"' % (newimg, authorid))

    setWorkPages()
    logger.info('Library scan complete')
    return new_book_count
def addAuthorNameToDB(author=None, refresh=False, addbooks=True):
    """Look up an author by name, importing them into the database if missing.

    The name is normalised with formatAuthorName() first. If there is no
    exact database match and config ADD_AUTHOR is set, query GoodReads and,
    when the returned name fuzzy-matches closely enough (>= 90), add the
    author under GoodReads' canonical spelling.

    Returns a 3-tuple (authorname, authorid, new): our preferred author
    name, the AuthorID, and new=True only if this call added the author.
    Returns ("", "", False) if not found or unable to add.
    """
    myDB = database.DBConnection()
    new = False
    # Guard against None (the declared default) as well as too-short names;
    # the old "len(author) < 2" alone raised TypeError on None
    if not author or len(author) < 2:
        logger.debug('Invalid Author Name [%s]' % author)
        return "", "", False

    author = formatAuthorName(author)
    # Check if the author exists, and import the author if not
    # (double the quotes to keep the SQL literal valid if the name contains one)
    check_exist_author = myDB.match(
        'SELECT AuthorID FROM authors where AuthorName="%s"' % author.replace('"', '""'))

    if not check_exist_author and lazylibrarian.CONFIG['ADD_AUTHOR']:
        logger.debug('Author %s not found in database, trying to add' % author)
        # no match for supplied author, but we're allowed to add new ones
        GR = GoodReads(author)
        try:
            author_gr = GR.find_author_id()
        except Exception as e:
            logger.warn("Error finding author id for [%s] %s" % (author, str(e)))
            return "", "", False

        # only try to add if GR data matches found author data
        if author_gr:
            authorname = author_gr['authorname']
            # authorid = author_gr['authorid']
            # "J.R.R. Tolkien" is the same person as "J. R. R. Tolkien" and "J R R Tolkien"
            # so compare with dots replaced by spaces and whitespace collapsed
            match_auth = author.replace('.', ' ')
            match_auth = ' '.join(match_auth.split())
            match_name = authorname.replace('.', ' ')
            match_name = ' '.join(match_name.split())
            match_name = unaccented(match_name)
            match_auth = unaccented(match_auth)
            # allow a degree of fuzziness to cater for different accented character handling.
            # some author names have accents,
            # filename may have the accented or un-accented version of the character
            # The currently non-configurable value of fuzziness might need to go in config
            # We stored GoodReads unmodified author name in
            # author_gr, so store in LL db under that
            # fuzz.ratio doesn't lowercase for us
            match_fuzz = fuzz.ratio(match_auth.lower(), match_name.lower())
            if match_fuzz < 90:
                logger.debug("Failed to match author [%s] to authorname [%s] fuzz [%d]" %
                             (author, match_name, match_fuzz))

            # To save loading hundreds of books by unknown authors at GR or GB, ignore unknown
            if (author != "Unknown") and (match_fuzz >= 90):
                # use "intact" name for author that we stored in
                # GR author_dict, not one of the various mangled versions
                # otherwise the books appear to be by a different author!
                author = author_gr['authorname']
                authorid = author_gr['authorid']
                # this new authorname may already be in the
                # database, so check again
                check_exist_author = myDB.match(
                    'SELECT AuthorID FROM authors where AuthorID="%s"' % authorid)
                if check_exist_author:
                    logger.debug('Found goodreads authorname %s in database' % author)
                else:
                    logger.info("Adding new author [%s]" % author)
                    try:
                        addAuthorToDB(authorname=author, refresh=refresh,
                                      authorid=authorid, addbooks=addbooks)
                        check_exist_author = myDB.match(
                            'SELECT AuthorID FROM authors where AuthorID="%s"' % authorid)
                        if check_exist_author:
                            new = True
                    except Exception:
                        logger.debug('Failed to add author [%s] to db' % author)

    # check author exists in db, either newly loaded or already there
    if not check_exist_author:
        logger.debug("Failed to match author [%s] in database" % author)
        return "", "", False

    return author, check_exist_author['AuthorID'], new
def find_results(self, searchterm=None, queue=None):
    """Search the GoodReads API for *searchterm* and put a list of result
    dicts (one per matching work, with fuzz scores against the searchterm)
    onto *queue*. Runs as a worker: never raises, logs any failure.
    """
    try:
        resultlist = []
        # NOTE(review): api_hits is initialised but never incremented, so the
        # "API was hit N times" log below always reports 0 — confirm intent
        api_hits = 0
        # we don't use the title/author separator in goodreads
        searchterm = searchterm.replace(' <ll> ', '')
        url = urllib.quote_plus(searchterm.encode(lazylibrarian.SYS_ENCODING))
        set_url = 'http://www.goodreads.com/search.xml?q=' + url + '&' + urllib.urlencode(self.params)
        logger.debug('Now searching GoodReads API with searchterm: %s' % searchterm)
        # logger.debug('Searching for %s at: %s' % (searchterm, set_url))
        resultcount = 0
        try:
            try:
                # in_cache tells us whether the xml came from the local cache;
                # it is currently unused here
                rootxml, in_cache = get_xml_request(set_url)
            except Exception as e:
                logger.error("Error finding gr results: %s" % str(e))
                return
            if not len(rootxml):
                logger.debug("Error requesting results")
                return

            resultxml = rootxml.getiterator('work')
            # each <work> element holds one search result
            for author in resultxml:
                if author.find('original_publication_year').text is None:
                    bookdate = "0000"
                else:
                    bookdate = author.find('original_publication_year').text
                authorNameResult = author.find('./best_book/author/name').text
                # Goodreads sometimes puts extra whitepase in the author names!
                authorNameResult = ' '.join(authorNameResult.split())
                booksub = ""
                bookpub = ""
                booklang = "Unknown"

                try:
                    bookimg = author.find('./best_book/image_url').text
                    if bookimg == 'http://www.goodreads.com/assets/nocover/111x148.png':
                        bookimg = 'images/nocover.png'
                except (KeyError, AttributeError):
                    bookimg = 'images/nocover.png'

                try:
                    # NOTE(review): ElementTree find() returns None for a missing
                    # element, which raises AttributeError (not KeyError) on
                    # .text — this handler may never fire; confirm
                    bookrate = author.find('average_rating').text
                except KeyError:
                    bookrate = 0

                bookpages = '0'
                bookgenre = ''
                bookdesc = ''
                bookisbn = ''
                booklink = 'http://www.goodreads.com/book/show/' + author.find('./best_book/id').text

                if author.find('./best_book/title').text is None:
                    bookTitle = ""
                else:
                    bookTitle = author.find('./best_book/title').text

                # score this result against the search term
                author_fuzz = fuzz.ratio(authorNameResult, searchterm)
                book_fuzz = fuzz.ratio(bookTitle, searchterm)
                isbn_fuzz = 0
                if is_valid_isbn(searchterm):
                    isbn_fuzz = 100

                highest_fuzz = max((author_fuzz + book_fuzz) / 2, isbn_fuzz)

                bookid = author.find('./best_book/id').text

                resultlist.append({
                    'authorname': author.find('./best_book/author/name').text,
                    'bookid': bookid,
                    'authorid': author.find('./best_book/author/id').text,
                    'bookname': bookTitle.encode("ascii", "ignore"),
                    'booksub': booksub,
                    'bookisbn': bookisbn,
                    'bookpub': bookpub,
                    'bookdate': bookdate,
                    'booklang': booklang,
                    'booklink': booklink,
                    'bookrate': float(bookrate),
                    'bookimg': bookimg,
                    'bookpages': bookpages,
                    'bookgenre': bookgenre,
                    'bookdesc': bookdesc,
                    'author_fuzz': author_fuzz,
                    'book_fuzz': book_fuzz,
                    'isbn_fuzz': isbn_fuzz,
                    'highest_fuzz': highest_fuzz,
                    'num_reviews': float(bookrate)
                })

                resultcount += 1

        except urllib2.HTTPError as err:
            if err.code == 404:
                logger.error('Received a 404 error when searching for author')
            if err.code == 403:
                logger.warn('Access to api is denied: usage exceeded')
            else:
                logger.error('An unexpected error has occurred when searching for an author: %s' % str(err))

        logger.debug('Found %s result%s with keyword: %s' % (resultcount, plural(resultcount), searchterm))
        logger.debug(
            'The GoodReads API was hit %s time%s for keyword %s' % (api_hits, plural(api_hits), searchterm))

        queue.put(resultlist)

    except Exception:
        logger.error('Unhandled exception in GR.find_results: %s' % traceback.format_exc())
def LibraryScan(dir=None):
    """Scan the ebook directory tree (default DOWNLOAD_DIR), matching files
    against library metadata and marking found books "Open". Rebuilds the
    per-scan stats table and updates per-author book counts. Returns None.
    """
    if not dir:
        if not lazylibrarian.DOWNLOAD_DIR:
            return
        else:
            dir = lazylibrarian.DOWNLOAD_DIR
    if not os.path.isdir(dir):
        logger.warn(
            'Cannot find directory: %s. Not scanning' % dir.decode(lazylibrarian.SYS_ENCODING, 'replace'))
        return

    myDB = database.DBConnection()

    # start each scan with a fresh stats table
    myDB.action('drop table if exists stats')
    myDB.action(
        'create table stats ( authorname text, GR_book_hits int, GR_lang_hits int, LT_lang_hits int, GB_lang_change, cache_hits int, bad_lang int, bad_char int, uncached int )')

    new_authors = []

    logger.info(
        'Scanning ebook directory: %s' % dir.decode(lazylibrarian.SYS_ENCODING, 'replace'))

    new_book_count = 0
    file_count = 0

    # On a full scan, first mark any "Open" book whose file is missing from disk
    if lazylibrarian.FULL_SCAN:
        books = myDB.select(
            'select AuthorName, BookName, BookFile, BookID from books where Status="Open"')
        status = lazylibrarian.NOTFOUND_STATUS
        logger.info('Missing books will be marked as %s' % status)
        for book in books:
            bookName = book['BookName']
            bookAuthor = book['AuthorName']
            bookID = book['BookID']
            bookfile = book['BookFile']
            if not(bookfile and os.path.isfile(bookfile)):
                myDB.action(
                    'update books set Status="%s" where BookID="%s"' % (status, bookID))
                myDB.action(
                    'update books set BookFile="" where BookID="%s"' % bookID)
                logger.warn(
                    'Book %s - %s updated as not found on disk' % (bookAuthor, bookName))
                # (a stale commented-out per-booktype existence check was removed here)
                # remember this author so their book counts get refreshed below
                if bookAuthor not in new_authors:
                    new_authors.append(bookAuthor)

    # guess this was meant to save repeat-scans of the same directory
    # if it contains multiple formats of the same book, but there was no code
    # that looked at the array. renamed from latest to processed to make
    # purpose clearer
    processed_subdirectories = []

    matchString = ''
    for char in lazylibrarian.EBOOK_DEST_FILE:
        matchString = matchString + '\\' + char
    # massage the EBOOK_DEST_FILE config parameter into something we can use
    # with regular expression matching
    booktypes = ''
    count = -1
    booktype_list = formatter.getList(lazylibrarian.EBOOK_TYPE)
    for book_type in booktype_list:
        count += 1
        if count == 0:
            booktypes = book_type
        else:
            booktypes = booktypes + '|' + book_type
    matchString = matchString.replace("\\$\\A\\u\\t\\h\\o\\r", "(?P<author>.*?)").replace(
        "\\$\\T\\i\\t\\l\\e", "(?P<book>.*?)") + '\.[' + booktypes + ']'
    pattern = re.compile(matchString, re.VERBOSE)

    for r, d, f in os.walk(dir):
        for directory in d[:]:
            if directory.startswith("."):
                d.remove(directory)
            # prevent magazine being scanned
            if directory.startswith("_"):
                d.remove(directory)
        for files in f:
            file_count += 1
            subdirectory = r.replace(dir, '')
            # Added new code to skip if we've done this directory before. Made this conditional with a switch in config.ini
            # in case user keeps multiple different books in the same
            # subdirectory
            if (lazylibrarian.IMP_SINGLEBOOK) and (subdirectory in processed_subdirectories):
                logger.debug("[%s] already scanned" % subdirectory)
            else:
                # If this is a book, try to get author/title/isbn/language
                # If metadata.opf exists, use that
                # else if epub or mobi, read metadata from the book
                # else have to try pattern match for author/title and look up isbn/lang from LT or GR late
                match = 0
                extn = ""

                if '.' in files:
                    words = files.split('.')
                    extn = words[len(words) - 1]

                if formatter.is_valid_booktype(files):
                    logger.debug(
                        "[%s] Now scanning subdirectory %s" %
                        (dir.decode(lazylibrarian.SYS_ENCODING, 'replace'),
                         subdirectory.decode(lazylibrarian.SYS_ENCODING, 'replace')))

                    # calibre uses "metadata.opf", LL uses "bookname - authorname.opf"
                    # just look for any .opf file in the current directory since we don't know
                    # LL preferred authorname/bookname at this point
                    metafile = opf_file(r)
                    try:
                        res = get_book_info(metafile)
                    except:
                        res = {}
                    if 'title' in res and 'creator' in res:  # this is the minimum we need
                        book = res['title']
                        author = res['creator']
                        if 'language' in res:
                            language = res['language']
                        else:
                            language = ""
                        if 'identifier' in res:
                            isbn = res['identifier']
                        else:
                            isbn = ""
                        match = 1
                        logger.debug(
                            "file meta [%s] [%s] [%s] [%s]" % (isbn, language, author, book))
                    else:
                        logger.debug("File meta incomplete in %s" % metafile)

                    if not match:
                        # it's a book, but no external metadata found
                        # if it's an epub or a mobi we can try to read metadata
                        # from it
                        if (extn == "epub") or (extn == "mobi"):
                            book_filename = os.path.join(r, files).encode(lazylibrarian.SYS_ENCODING)
                            try:
                                res = get_book_info(book_filename)
                            except:
                                res = {}
                            if 'title' in res and 'creator' in res:  # this is the minimum we need
                                book = res['title']
                                author = res['creator']
                                if 'language' in res:
                                    language = res['language']
                                else:
                                    language = ""
                                if 'identifier' in res:
                                    isbn = res['identifier']
                                else:
                                    isbn = ""
                                logger.debug("book meta [%s] [%s] [%s] [%s]" %
                                             (isbn, language, author, book))
                                match = 1
                            else:
                                logger.debug("Book meta incomplete in %s" % book_filename)

                    if not match:
                        match = pattern.match(files)
                        if match:
                            author = match.group("author")
                            book = match.group("book")
                        else:
                            logger.debug("Pattern match failed [%s]" % files)

                if match:
                    processed_subdirectories.append(
                        subdirectory)  # flag that we found a book in this subdirectory
                    #
                    # If we have a valid looking isbn, and language != "Unknown", add it to cache
                    #
                    # NOTE(review): when the match came from the filename pattern,
                    # language/isbn were never assigned on this pass and may carry
                    # stale values from a previous iteration — confirm
                    if not language:
                        language = "Unknown"
                    if not formatter.is_valid_isbn(isbn):
                        isbn = ""
                    if isbn != "" and language != "Unknown":
                        logger.debug(
                            "Found Language [%s] ISBN [%s]" % (language, isbn))
                        # we need to add it to language cache if not already
                        # there, is_valid_isbn has checked length is 10 or 13
                        if len(isbn) == 10:
                            isbnhead = isbn[0:3]
                        else:
                            isbnhead = isbn[3:6]
                        match = myDB.action(
                            'SELECT lang FROM languages where isbn = "%s"' % (isbnhead)).fetchone()
                        if not match:
                            myDB.action(
                                'insert into languages values ("%s", "%s")' % (isbnhead, language))
                            logger.debug(
                                "Cached Lang [%s] ISBN [%s]" % (language, isbnhead))
                        else:
                            logger.debug(
                                "Already cached Lang [%s] ISBN [%s]" % (language, isbnhead))

                    # get authors name in a consistent format
                    if "," in author:  # "surname, forename"
                        words = author.split(',')
                        author = words[1].strip() + ' ' + words[0].strip()  # "forename surname"
                    if author[1] == ' ':
                        # single-letter initials separated by spaces: normalise to dotted form
                        author = author.replace(' ', '.')
                        author = author.replace('..', '.')

                    # Check if the author exists, and import the author if not,
                    # before starting any complicated book-name matching to save repeating the search
                    #
                    check_exist_author = myDB.action(
                        'SELECT * FROM authors where AuthorName="%s"' % author).fetchone()
                    if not check_exist_author and lazylibrarian.ADD_AUTHOR:
                        # no match for supplied author, but we're allowed to
                        # add new ones
                        GR = GoodReads(author)
                        try:
                            author_gr = GR.find_author_id()
                        except:
                            logger.warn(
                                "Error finding author id for [%s]" % author)
                            continue

                        # only try to add if GR data matches found author data
                        # not sure what this is for, never seems to fail??
                        if author_gr:
                            authorname = author_gr['authorname']
                            # "J.R.R. Tolkien" is the same person as "J. R. R. Tolkien" and "J R R Tolkien"
                            match_auth = author.replace('.', '_')
                            match_auth = match_auth.replace(' ', '_')
                            match_auth = match_auth.replace('__', '_')
                            match_name = authorname.replace('.', '_')
                            match_name = match_name.replace(' ', '_')
                            match_name = match_name.replace('__', '_')
                            match_name = common.remove_accents(match_name)
                            match_auth = common.remove_accents(match_auth)
                            # allow a degree of fuzziness to cater for different accented character handling.
                            # some author names have accents,
                            # filename may have the accented or un-accented version of the character
                            # The currently non-configurable value of fuzziness might need to go in config
                            # We stored GoodReads unmodified author name in
                            # author_gr, so store in LL db under that
                            match_fuzz = fuzz.ratio(match_auth, match_name)
                            if match_fuzz < 90:
                                logger.debug(
                                    "Failed to match author [%s] fuzz [%d]" % (author, match_fuzz))
                                logger.debug(
                                    "Failed to match author [%s] to authorname [%s]" % (match_auth, match_name))

                            # To save loading hundreds of books by unknown
                            # authors at GR or GB, ignore if author "Unknown"
                            if (author != "Unknown") and (match_fuzz >= 90):
                                # use "intact" name for author that we stored in
                                # GR author_dict, not one of the various mangled versions
                                # otherwise the books appear to be by a
                                # different author!
                                author = author_gr['authorname']
                                # this new authorname may already be in the
                                # database, so check again
                                check_exist_author = myDB.action(
                                    'SELECT * FROM authors where AuthorName="%s"' % author).fetchone()
                                if not check_exist_author:
                                    logger.debug(
                                        "Adding new author [%s]" % author)
                                    if author not in new_authors:
                                        new_authors.append(author)
                                    try:
                                        importer.addAuthorToDB(author)
                                        check_exist_author = myDB.action(
                                            'SELECT * FROM authors where AuthorName="%s"' % author).fetchone()
                                    except:
                                        continue

                    # check author exists in db, either newly loaded or already
                    # there
                    if not check_exist_author:
                        logger.debug(
                            "Failed to match author [%s] in database" % author)
                    else:
                        # author exists, check if this book by this author is in our database
                        # metadata might have quotes in book name
                        book = book.replace('"', '').replace("'", "")
                        bookid = find_book_in_db(myDB, author, book)
                        if bookid:
                            # check if book is already marked as "Open" (if so,
                            # we already had it)
                            check_status = myDB.action(
                                'SELECT Status from books where BookID="%s"' % bookid).fetchone()
                            if check_status['Status'] != 'Open':
                                # update status as we've got this book
                                myDB.action(
                                    'UPDATE books set Status="Open" where BookID="%s"' % bookid)
                                book_filename = os.path.join(
                                    r, files).encode(
                                    lazylibrarian.SYS_ENCODING)
                                # update book location so we can check if it
                                # gets removed, or allow click-to-open
                                myDB.action(
                                    'UPDATE books set BookFile="%s" where BookID="%s"' % (book_filename, bookid))
                                new_book_count += 1

    cachesize = myDB.action("select count(*) from languages").fetchone()
    logger.info(
        "%s new/modified books found and added to the database" % new_book_count)
    logger.info("%s files processed" % file_count)

    # show statistics if anything new was added
    if new_book_count:
        stats = myDB.action(
            "SELECT sum(GR_book_hits), sum(GR_lang_hits), sum(LT_lang_hits), sum(GB_lang_change), sum(cache_hits), sum(bad_lang), sum(bad_char), sum(uncached) FROM stats").fetchone()
        if lazylibrarian.BOOK_API == "GoogleBooks":
            logger.debug(
                "GoogleBooks was hit %s times for books" % stats['sum(GR_book_hits)'])
            logger.debug(
                "GoogleBooks language was changed %s times" % stats['sum(GB_lang_change)'])
        if lazylibrarian.BOOK_API == "GoodReads":
            logger.debug(
                "GoodReads was hit %s times for books" % stats['sum(GR_book_hits)'])
            logger.debug(
                "GoodReads was hit %s times for languages" % stats['sum(GR_lang_hits)'])
        logger.debug(
            "LibraryThing was hit %s times for languages" % stats['sum(LT_lang_hits)'])
        logger.debug(
            "Language cache was hit %s times" % stats['sum(cache_hits)'])
        logger.debug(
            "Unwanted language removed %s books" % stats['sum(bad_lang)'])
        logger.debug(
            "Unwanted characters removed %s books" % stats['sum(bad_char)'])
        logger.debug(
            "Unable to cache %s books with missing ISBN" % stats['sum(uncached)'])
        logger.debug("ISBN Language cache holds %s entries" % cachesize['count(*)'])
        stats = len(
            myDB.select('select BookID from Books where status="Open" and BookLang="Unknown"'))
        if stats:
            logger.warn(
                "There are %s books in your library with unknown language" % stats)

    # refresh have/unignored book counts for every author touched by this scan
    logger.debug('Updating %i authors' % len(new_authors))
    for auth in new_authors:
        havebooks = len(
            myDB.select('select BookName from Books where status="%s" and AuthorName="%s"' % ('Open', auth)))
        myDB.action(
            'UPDATE authors set HaveBooks="%s" where AuthorName="%s"' % (havebooks, auth))
        totalbooks = len(
            myDB.select('select BookName from Books where status!="%s" and AuthorName="%s"' % ('Ignored', auth)))
        myDB.action(
            'UPDATE authors set UnignoredBooks="%s" where AuthorName="%s"' % (totalbooks, auth))

    logger.info('Library scan complete')
def addAuthorNameToDB(author=None, refresh=False, addbooks=True):
    """Normalise an author name, find it in the database, importing it if allowed.

    Returns a tuple (authorname, authorid, new):
      authorname -- our preferred form of the name, or "" if the author
                    could not be found or added
      authorid   -- the matching AuthorID, or "" on failure
      new        -- True only when the author was freshly added to the db,
                    False when already present (or on failure)
    """
    freshly_added = False

    # Reject obviously useless input before touching the database.
    if not author or len(author) < 2:
        logger.debug('Invalid Author Name [%s]' % author)
        return "", "", False

    author = formatAuthorName(author)
    db = database.DBConnection()

    # First try an exact match on the normalised name.
    existing = db.match('SELECT AuthorID FROM authors where AuthorName=?', (author,))

    # Failing that, a close fuzzy match catches misspellings and accented variants.
    if not existing:
        wanted = author.lower()
        for row in db.action('select AuthorID,AuthorName from authors'):
            candidate = row['AuthorName']
            if not candidate:
                continue
            score = fuzz.ratio(candidate.lower(), wanted)
            if score >= 95:
                logger.debug("Fuzzy match [%s] %s%% for [%s]" % (row['AuthorName'], score, author))
                existing = row
                author = row['AuthorName']
                break

    if not existing and lazylibrarian.CONFIG['ADD_AUTHOR']:
        # Unknown to us, but we are allowed to add new authors.
        logger.debug('Author %s not found in database, trying to add' % author)
        try:
            author_gr = GoodReads(author).find_author_id()
        except Exception as e:
            logger.warn("%s finding author id for [%s] %s" % (type(e).__name__, author, str(e)))
            return "", "", False

        # Only add when the GoodReads result really looks like the same person.
        if author_gr:
            # "J.R.R. Tolkien" is the same person as "J. R. R. Tolkien" and
            # "J R R Tolkien": strip dots, collapse whitespace and remove
            # accents on both sides before comparing, since filenames may
            # carry either the accented or un-accented character forms.
            ours = unaccented(' '.join(author.replace('.', ' ').split()))
            theirs = unaccented(' '.join(author_gr['authorname'].replace('.', ' ').split()))
            # fuzz.ratio does not lowercase for us
            score = fuzz.ratio(ours.lower(), theirs.lower())
            if score < 90:
                logger.debug("Failed to match author [%s] to authorname [%s] fuzz [%d]" %
                             (author, theirs, score))

            # Threshold is hard-coded for now; ignore "Unknown" so we don't
            # pull in hundreds of books by unidentified authors.
            if author != "Unknown" and score >= 90:
                # Use the "intact" name GoodReads returned, not one of our
                # mangled comparison copies, otherwise the books appear to
                # belong to a different author.
                author = author_gr['authorname']
                authorid = author_gr['authorid']
                # That GoodReads name may already be stored under its id.
                existing = db.match('SELECT AuthorID FROM authors where AuthorID=?', (authorid,))
                if existing:
                    logger.debug('Found goodreads authorname %s in database' % author)
                else:
                    logger.info("Adding new author [%s]" % author)
                    try:
                        addAuthorToDB(authorname=author, refresh=refresh, authorid=authorid, addbooks=addbooks)
                        existing = db.match('SELECT AuthorID FROM authors where AuthorID=?', (authorid,))
                        if existing:
                            freshly_added = True
                    except Exception as e:
                        logger.error('Failed to add author [%s] to db: %s %s' % (author, type(e).__name__, str(e)))

    # By now the author is either in the database (found or newly loaded)
    # or unobtainable.
    if not existing:
        logger.debug("Failed to match author [%s] in database" % author)
        return "", "", False
    return makeUnicode(author), existing['AuthorID'], freshly_added