def search_rss_book(books=None, reset=False):
    """
    Search the results returned by the RSS providers for wanted books.

    books: None to look for every book whose Status is "Wanted" (backlog search),
           otherwise an iterable of dicts with a 'bookid' key; only those of them
           that are still "Wanted" are searched for
    reset: if True, restart the 'search_rss_book' scheduled job after the search
           has run to the end (the early returns below skip the restart)
    return: None
    """
    current = threading.current_thread()
    if "Thread-" in current.name:
        # give an anonymous worker thread a recognisable name
        current.name = "SEARCHRSS"

    if not lazylibrarian.USE_RSS():
        logger.warn('RSS search is disabled')
        scheduleJob(action='Stop', target='search_rss_book')
        return

    myDB = database.DBConnection()

    if books is None:
        # We are performing a backlog search
        searchbooks = myDB.select(
            'SELECT BookID, AuthorName, Bookname, BookSub, BookAdded from books '
            'WHERE Status="Wanted" order by BookAdded desc')
    else:
        # The user has added a new book
        searchbooks = []
        for book in books:
            # Bind the id as a parameter instead of formatting it into the SQL.
            # NOTE(review): assumes DBConnection.select accepts (query, args) the
            # same way DBConnection.match is called elsewhere in this file.
            searchbook = myDB.select(
                'SELECT BookID, AuthorName, BookName, BookSub from books '
                'WHERE BookID=? AND Status="Wanted"', (book['bookid'],))
            searchbooks.extend(searchbook)

    if not searchbooks:
        return

    logger.info('RSS Searching for %i book%s' % (len(searchbooks), plural(len(searchbooks))))

    resultlist, nproviders = IterateOverRSSSites()
    if not nproviders:
        logger.warn('No rss providers are set, check config')
        return  # No point in continuing

    rss_count = 0
    for book in searchbooks:
        authorname, bookname = get_searchterm(book, "book")
        found = processResultList(resultlist, authorname, bookname, book, 'book')

        # if you can't find the book, try title without any "(extended details, series etc)"
        if not found and '(' in bookname:  # anything to shorten?
            authorname, bookname = get_searchterm(book, "shortbook")
            found = processResultList(resultlist, authorname, bookname, book, 'shortbook')

        if not found:
            logger.debug("Searches returned no results. Adding book %s - %s to queue." %
                         (authorname, bookname))
        # only results strictly greater than True are counted
        # NOTE(review): presumably processResultList returns a larger value when
        # something was actually snatched - confirm against its definition
        if found > True:
            rss_count += 1

    logger.info("RSS Search for Wanted items complete, found %s book%s" % (rss_count, plural(rss_count)))

    if reset:
        scheduleJob(action='Restart', target='search_rss_book')
def findBestResult(resultlist, book, searchtype, source):
    """
    Score every provider result against the wanted book and return the best one.

    resultlist: collated results from search providers
    book:       the book we want to find
    searchtype: book, magazine, shortbook, audiobook etc.
    source:     nzb, tor, rss, direct
    return:     highest scoring match as [score, newValueDict, controlValueDict, priority],
                or None if every result was rejected or an exception was caught.
                A match scoring below MATCH_RATIO is still returned (it is only
                logged as "Nearest match"), so the caller has to check the score.
    """
    # noinspection PyBroadException
    try:
        myDB = database.DBConnection()
        # raw string for the last key: '\s' is not a valid escape in a normal literal
        dictrepl = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '',
                    ',': ' ', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '',
                    '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '',
                    ':': '', '!': '', '-': ' ', r'\s\s': ' '}

        dic = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '',
               ',': '', '*': '', ':': '.', ';': '', '\'': ''}

        if source == 'rss':
            author, title = get_searchterm(book, searchtype)
        else:
            author = unaccented_str(replace_all(book['authorName'], dic))
            title = unaccented_str(replace_all(book['bookName'], dic))

        if book['library'] == 'AudioBook':
            reject_list = getList(lazylibrarian.CONFIG['REJECT_AUDIO'])
            maxsize = check_int(lazylibrarian.CONFIG['REJECT_MAXAUDIO'], 0)
            minsize = check_int(lazylibrarian.CONFIG['REJECT_MINAUDIO'], 0)
            typelist = getList(lazylibrarian.CONFIG['AUDIOBOOK_TYPE'])
            auxinfo = 'AudioBook'
        else:  # elif book['library'] == 'eBook':
            reject_list = getList(lazylibrarian.CONFIG['REJECT_WORDS'])
            maxsize = check_int(lazylibrarian.CONFIG['REJECT_MAXSIZE'], 0)
            minsize = check_int(lazylibrarian.CONFIG['REJECT_MINSIZE'], 0)
            typelist = getList(lazylibrarian.CONFIG['EBOOK_TYPE'])
            auxinfo = 'eBook'

        if source == 'nzb':
            prefix = 'nzb'
        else:  # rss and libgen return same names as torrents
            prefix = 'tor_'

        # these do not depend on the individual result, so work them out once
        author_words = getList(author.lower())
        title_words = getList(title.lower())
        ranked_types = list(reversed(typelist))  # last position = leftmost (preferred) type

        logger.debug('Searching %s %s results for best %s match' % (len(resultlist), source, auxinfo))

        matches = []
        for res in resultlist:
            resultTitle = unaccented_str(replace_all(res[prefix + 'title'], dictrepl)).strip()
            resultTitle = re.sub(r"\s\s+", " ", resultTitle)  # remove extra whitespace
            Author_match = fuzz.token_set_ratio(author, resultTitle)
            Book_match = fuzz.token_set_ratio(title, resultTitle)
            if lazylibrarian.LOGLEVEL & lazylibrarian.log_fuzz:
                logger.debug("%s author/book Match: %s/%s %s at %s" %
                             (source.upper(), Author_match, Book_match, resultTitle, res[prefix + 'prov']))

            rejected = False

            url = res[prefix + 'url']
            if url is None:
                rejected = True
                logger.debug("Rejecting %s, no URL found" % resultTitle)

            if not rejected and lazylibrarian.CONFIG['BLACKLIST_FAILED']:
                already_failed = myDB.match('SELECT * from wanted WHERE NZBurl=? and Status="Failed"', (url,))
                if already_failed:
                    logger.debug("Rejecting %s, blacklisted at %s" % (resultTitle, already_failed['NZBprov']))
                    rejected = True

            if not rejected and lazylibrarian.CONFIG['BLACKLIST_PROCESSED']:
                # any earlier entry for this url counts, whatever its status
                already_failed = myDB.match('SELECT * from wanted WHERE NZBurl=?', (url,))
                if already_failed:
                    logger.debug("Rejecting %s, blacklisted at %s" % (resultTitle, already_failed['NZBprov']))
                    rejected = True

            if not rejected and not url.startswith(('http', 'magnet')):
                rejected = True
                logger.debug("Rejecting %s, invalid URL [%s]" % (resultTitle, url))

            if not rejected:
                result_words = getList(resultTitle.lower())
                for word in reject_list:
                    # a reject word is allowed if it is part of the wanted author or title
                    if word in result_words and word not in author_words and word not in title_words:
                        rejected = True
                        logger.debug("Rejecting %s, contains %s" % (resultTitle, word))
                        break

            size_temp = check_int(res[prefix + 'size'], 1000)  # Need to cater for when this is NONE (Issue 35)
            size = round(float(size_temp) / 1048576, 2)  # divide by 1024 * 1024

            if not rejected and maxsize and size > maxsize:
                rejected = True
                logger.debug("Rejecting %s, too large" % resultTitle)

            if not rejected and minsize and size < minsize:
                rejected = True
                logger.debug("Rejecting %s, too small" % resultTitle)

            if rejected:
                continue

            if source == 'nzb':
                mode = res['nzbmode']  # nzb, torznab
            else:
                mode = res['tor_type']  # torrent, magnet, nzb(from rss), direct

            controlValueDict = {"NZBurl": url}
            newValueDict = {
                "NZBprov": res[prefix + 'prov'],
                "BookID": book['bookid'],
                "NZBdate": now(),  # when we asked for it
                "NZBsize": size,
                "NZBtitle": resultTitle,
                "NZBmode": mode,
                "AuxInfo": auxinfo,
                "Status": "Skipped"
            }

            score = (Book_match + Author_match) / 2  # as a percentage

            # lose a point for each unwanted word in the title so we get the closest match
            # but for RSS ignore anything at the end in square braces [keywords, genres etc]
            if source == 'rss':
                wordlist = getList(resultTitle.rsplit('[', 1)[0].lower())
            else:
                wordlist = getList(resultTitle.lower())
            unwanted = [x for x in wordlist
                        if x not in author_words and x not in title_words and x not in typelist]
            score -= len(unwanted)

            # prioritise titles that include the ebook types we want
            # add more points for booktypes nearer the left in the list
            # eg if epub, mobi, pdf add 3 points if epub found, 2 for mobi, 1 for pdf
            for item in wordlist:
                if item in typelist:
                    score += sum(i + 1 for i, x in enumerate(ranked_types) if x == item)

            matches.append([score, newValueDict, controlValueDict, res['priority']])

        if not matches:
            logger.debug("No %s found for [%s] using searchtype %s" % (source, book["searchterm"], searchtype))
            return None

        # best score wins, provider priority breaks ties
        highest = max(matches, key=lambda s: (s[0], s[3]))
        score = highest[0]
        newValueDict = highest[1]
        dlpriority = highest[3]

        if score < int(lazylibrarian.CONFIG['MATCH_RATIO']):
            logger.info('Nearest match (%s%%): %s using %s search for %s %s' %
                        (score, newValueDict['NZBtitle'], searchtype, book['authorName'], book['bookName']))
        else:
            logger.info('Best match (%s%%): %s using %s search, %s priority %s' %
                        (score, newValueDict['NZBtitle'], searchtype, newValueDict['NZBprov'], dlpriority))
        return highest
    except Exception:
        logger.error('Unhandled exception in findBestResult: %s' % traceback.format_exc())
        return None
def findBestResult(resultlist, book, searchtype, source):
    """
    Pick the highest scoring provider result for a wanted book.

    resultlist: collated results from search providers
    book:       the book we want to find
    searchtype: book, magazine, shortbook, audiobook etc.
    source:     nzb, tor, rss, direct
    return:     highest scoring match as [score, newValueDict, controlValueDict, priority],
                or None if no result passed the reject checks or an exception was caught.
                The match is returned even when its score is below MATCH_RATIO
                (logged as "Nearest match"); checking the score is left to the caller.
    """
    # noinspection PyBroadException
    try:
        myDB = database.DBConnection()
        # last key is a raw string: '\s' in a plain literal is an invalid escape sequence
        dictrepl = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '',
                    ',': ' ', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '',
                    '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '',
                    ':': '', '!': '', '-': ' ', r'\s\s': ' '}

        dic = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '',
               ',': '', '*': '', ':': '.', ';': '', '\'': ''}

        if source == 'rss':
            author, title = get_searchterm(book, searchtype)
        else:
            author = unaccented_str(replace_all(book['authorName'], dic))
            title = unaccented_str(replace_all(book['bookName'], dic))

        is_audio = book['library'] == 'AudioBook'
        if is_audio:
            reject_list = getList(lazylibrarian.CONFIG['REJECT_AUDIO'])
            maxsize = check_int(lazylibrarian.CONFIG['REJECT_MAXAUDIO'], 0)
            minsize = check_int(lazylibrarian.CONFIG['REJECT_MINAUDIO'], 0)
            typelist = getList(lazylibrarian.CONFIG['AUDIOBOOK_TYPE'])
            auxinfo = 'AudioBook'
        else:  # elif book['library'] == 'eBook':
            reject_list = getList(lazylibrarian.CONFIG['REJECT_WORDS'])
            maxsize = check_int(lazylibrarian.CONFIG['REJECT_MAXSIZE'], 0)
            minsize = check_int(lazylibrarian.CONFIG['REJECT_MINSIZE'], 0)
            typelist = getList(lazylibrarian.CONFIG['EBOOK_TYPE'])
            auxinfo = 'eBook'

        # rss and libgen return same names as torrents
        prefix = 'nzb' if source == 'nzb' else 'tor_'

        # invariant for the whole result list, so only compute them once
        author_words = getList(author.lower())
        title_words = getList(title.lower())
        ranked_types = list(reversed(typelist))  # higher index = further left in the config list

        logger.debug('Searching %s %s results for best %s match' % (len(resultlist), source, auxinfo))

        matches = []
        for res in resultlist:
            resultTitle = unaccented_str(replace_all(res[prefix + 'title'], dictrepl)).strip()
            resultTitle = re.sub(r"\s\s+", " ", resultTitle)  # remove extra whitespace
            Author_match = fuzz.token_set_ratio(author, resultTitle)
            Book_match = fuzz.token_set_ratio(title, resultTitle)
            if lazylibrarian.LOGLEVEL & lazylibrarian.log_fuzz:
                logger.debug("%s author/book Match: %s/%s %s at %s" %
                             (source.upper(), Author_match, Book_match, resultTitle, res[prefix + 'prov']))

            rejected = False

            url = res[prefix + 'url']
            if url is None:
                rejected = True
                logger.debug("Rejecting %s, no URL found" % resultTitle)

            if not rejected and lazylibrarian.CONFIG['BLACKLIST_FAILED']:
                already_failed = myDB.match('SELECT * from wanted WHERE NZBurl=? and Status="Failed"', (url,))
                if already_failed:
                    logger.debug("Rejecting %s, blacklisted at %s" % (resultTitle, already_failed['NZBprov']))
                    rejected = True

            if not rejected and lazylibrarian.CONFIG['BLACKLIST_PROCESSED']:
                # here any row for the url is enough, the status is not looked at
                already_failed = myDB.match('SELECT * from wanted WHERE NZBurl=?', (url,))
                if already_failed:
                    logger.debug("Rejecting %s, blacklisted at %s" % (resultTitle, already_failed['NZBprov']))
                    rejected = True

            if not rejected and not url.startswith(('http', 'magnet')):
                rejected = True
                logger.debug("Rejecting %s, invalid URL [%s]" % (resultTitle, url))

            if not rejected:
                result_words = getList(resultTitle.lower())
                for word in reject_list:
                    # words that belong to the wanted author/title never cause a reject
                    if word in result_words and word not in author_words and word not in title_words:
                        rejected = True
                        logger.debug("Rejecting %s, contains %s" % (resultTitle, word))
                        break

            size_temp = check_int(res[prefix + 'size'], 1000)  # Need to cater for when this is NONE (Issue 35)
            size = round(float(size_temp) / 1048576, 2)  # 1048576 = 1024 * 1024

            if not rejected and maxsize and size > maxsize:
                rejected = True
                logger.debug("Rejecting %s, too large" % resultTitle)

            if not rejected and minsize and size < minsize:
                rejected = True
                logger.debug("Rejecting %s, too small" % resultTitle)

            if rejected:
                continue

            if source == 'nzb':
                mode = res['nzbmode']  # nzb, torznab
            else:
                mode = res['tor_type']  # torrent, magnet, nzb(from rss), direct

            controlValueDict = {"NZBurl": url}
            newValueDict = {
                "NZBprov": res[prefix + 'prov'],
                "BookID": book['bookid'],
                "NZBdate": now(),  # when we asked for it
                "NZBsize": size,
                "NZBtitle": resultTitle,
                "NZBmode": mode,
                "AuxInfo": auxinfo,
                "Status": "Skipped"
            }

            score = (Book_match + Author_match) / 2  # as a percentage

            # lose a point for each unwanted word in the title so we get the closest match
            # but for RSS ignore anything at the end in square braces [keywords, genres etc]
            if source == 'rss':
                wordlist = getList(resultTitle.rsplit('[', 1)[0].lower())
            else:
                wordlist = getList(resultTitle.lower())
            for x in wordlist:
                if x not in author_words and x not in title_words and x not in typelist:
                    score -= 1

            # prioritise titles that include the ebook types we want
            # add more points for booktypes nearer the left in the list
            # eg if epub, mobi, pdf add 3 points if epub found, 2 for mobi, 1 for pdf
            for item in wordlist:
                if item in typelist:
                    score += sum(i + 1 for i, x in enumerate(ranked_types) if x == item)

            matches.append([score, newValueDict, controlValueDict, res['priority']])

        if not matches:
            logger.debug("No %s found for [%s] using searchtype %s" % (source, book["searchterm"], searchtype))
            return None

        # highest score first, then highest provider priority
        highest = max(matches, key=lambda s: (s[0], s[3]))
        score = highest[0]
        newValueDict = highest[1]
        dlpriority = highest[3]

        if score < int(lazylibrarian.CONFIG['MATCH_RATIO']):
            logger.info('Nearest match (%s%%): %s using %s search for %s %s' %
                        (score, newValueDict['NZBtitle'], searchtype, book['authorName'], book['bookName']))
        else:
            logger.info('Best match (%s%%): %s using %s search, %s priority %s' %
                        (score, newValueDict['NZBtitle'], searchtype, newValueDict['NZBprov'], dlpriority))
        return highest
    except Exception:
        logger.error('Unhandled exception in findBestResult: %s' % traceback.format_exc())
        return None
def search_rss_book(books=None, reset=False):
    """
    Process the rss wishlists, then search the RSS providers for wanted books.

    Wishlist items that are already in the database and are not Open/Wanted/Have
    are marked "Wanted"; items that are not in the database are looked up and
    imported when the lookup scores above MATCH_RATIO.

    books: None to search for every "Wanted" book (backlog search), otherwise an
           iterable of dicts with a 'bookid' key; only those still "Wanted" are used
    reset: if True, restart the 'search_rss_book' scheduled job after the search
           has run to the end (the early returns below skip the restart)
    return: None. Any exception is caught and logged with its traceback.
    """
    # noinspection PyBroadException
    try:
        current = threading.current_thread()
        if "Thread-" in current.name:
            # give an anonymous worker thread a recognisable name
            current.name = "SEARCHALLRSS" if books is None else "SEARCHRSS"

        if not lazylibrarian.USE_RSS():
            logger.warn('RSS search is disabled')
            scheduleJob(action='Stop', target='search_rss_book')
            return

        if not internet():
            logger.warn('Search RSS Book: No internet connection')
            return

        myDB = database.DBConnection()

        resultlist, wishproviders = IterateOverGoodReads()
        if not wishproviders:
            logger.debug('No rss wishlists are set')
        else:
            # logged once for the whole list rather than once per item
            logger.debug('Processing %s item%s in wishlists' % (len(resultlist), plural(len(resultlist))))

            # for each item in resultlist, add to database if necessary, and mark as wanted
            for book in resultlist:
                # we get rss_author, rss_title, rss_isbn, rss_bookid (goodreads bookid)
                # we can just use bookid if goodreads, or try isbn and name matching on author/title if googlebooks
                # not sure if anyone would use a goodreads wishlist if not using goodreads interface...
                if book['rss_bookid'] and lazylibrarian.CONFIG['BOOK_API'] == "GoodReads":
                    # the id comes from an external feed, so bind it as a parameter
                    bookmatch = myDB.match('select Status,BookName from books where bookid=?',
                                           (book['rss_bookid'],))
                    if not bookmatch:
                        import_book(book['rss_bookid'])
                        continue

                    bookstatus = bookmatch['Status']
                    bookname = bookmatch['BookName']
                    if bookstatus in ['Open', 'Wanted', 'Have']:
                        logger.info(u'Found book %s, already marked as "%s"' % (bookname, bookstatus))
                    else:  # skipped/ignored
                        logger.info(u'Found book %s, marking as "Wanted"' % bookname)
                        # key the update on the id we just looked up; no local
                        # `bookid` has been assigned on this path
                        controlValueDict = {"BookID": book['rss_bookid']}
                        newValueDict = {"Status": "Wanted"}
                        myDB.upsert("books", newValueDict, controlValueDict)
                    continue

                item = {'Title': book['rss_title']}
                headers = []
                if book['rss_bookid']:
                    item['BookID'] = book['rss_bookid']
                    headers.append('BookID')
                if book['rss_isbn']:
                    item['ISBN'] = book['rss_isbn']
                    headers.append('ISBN')

                bookmatch = finditem(item, book['rss_author'], headers)
                if bookmatch:  # it's already in the database
                    authorname = bookmatch['AuthorName']
                    bookname = bookmatch['BookName']
                    bookid = bookmatch['BookID']
                    bookstatus = bookmatch['Status']
                    if bookstatus in ['Open', 'Wanted', 'Have']:
                        logger.info(u'Found book %s by %s, already marked as "%s"' %
                                    (bookname, authorname, bookstatus))
                    else:  # skipped/ignored
                        logger.info(u'Found book %s by %s, marking as "Wanted"' % (bookname, authorname))
                        controlValueDict = {"BookID": bookid}
                        newValueDict = {"Status": "Wanted"}
                        myDB.upsert("books", newValueDict, controlValueDict)
                    continue

                # not in database yet
                result = None
                results = ''
                if book['rss_isbn']:
                    results = search_for(book['rss_isbn'])
                if results:
                    result = results[0]
                    if result['isbn_fuzz'] > lazylibrarian.CONFIG['MATCH_RATIO']:
                        logger.info("Found (%s%%) %s: %s" %
                                    (result['isbn_fuzz'], result['authorname'], result['bookname']))
                        import_book(result['bookid'])
                        bookmatch = True

                if not results:
                    # nothing found by isbn (or no isbn), try title and author instead
                    searchterm = "%s <ll> %s" % (item['Title'], formatAuthorName(book['rss_author']))
                    results = search_for(unaccented(searchterm))

                # don't import a second time if the isbn lookup already matched
                if results and not bookmatch:
                    result = results[0]
                    if result['author_fuzz'] > lazylibrarian.CONFIG['MATCH_RATIO'] \
                            and result['book_fuzz'] > lazylibrarian.CONFIG['MATCH_RATIO']:
                        logger.info("Found (%s%% %s%%) %s: %s" %
                                    (result['author_fuzz'], result['book_fuzz'],
                                     result['authorname'], result['bookname']))
                        import_book(result['bookid'])
                        bookmatch = True

                if not bookmatch:
                    msg = "Skipping book %s by %s" % (item['Title'], book['rss_author'])
                    if not results:
                        msg += ', No results returned'
                        logger.warn(msg)
                    else:
                        msg += ', No match found'
                        logger.warn(msg)
                        msg = "Closest match (%s%% %s%%) %s: %s" % (result['author_fuzz'], result['book_fuzz'],
                                                                    result['authorname'], result['bookname'])
                        logger.warn(msg)

        if books is None:
            # We are performing a backlog search
            cmd = 'SELECT BookID, AuthorName, Bookname, BookSub, BookAdded from books,authors '
            cmd += 'WHERE books.AuthorID = authors.AuthorID and books.Status="Wanted" order by BookAdded desc'
            searchbooks = myDB.select(cmd)
        else:
            # The user has added a new book
            cmd = 'SELECT BookID, AuthorName, BookName, BookSub from books,authors '
            cmd += 'WHERE books.AuthorID = authors.AuthorID and BookID=? '
            cmd += 'AND books.Status="Wanted"'
            searchbooks = []
            for book in books:
                # NOTE(review): assumes DBConnection.select accepts (query, args)
                # in the same way DBConnection.match does - confirm
                searchbooks.extend(myDB.select(cmd, (book['bookid'],)))

        if not searchbooks:
            return

        resultlist, nproviders = IterateOverRSSSites()
        if not nproviders:
            # only worth a warning if there were no wishlists to process either
            if not wishproviders:
                logger.warn('No rss providers are set, check config')
            return  # No point in continuing

        logger.info('RSS Searching for %i book%s' % (len(searchbooks), plural(len(searchbooks))))

        rss_count = 0
        for book in searchbooks:
            authorname, bookname = get_searchterm(book, "book")
            found = processResultList(resultlist, authorname, bookname, book, 'book')

            # if you can't find the book, try title without any "(extended details, series etc)"
            if not found and '(' in bookname:  # anything to shorten?
                authorname, bookname = get_searchterm(book, "shortbook")
                found = processResultList(resultlist, authorname, bookname, book, 'shortbook')

            if not found:
                logger.debug("Searches returned no results. Adding book %s - %s to queue." %
                             (authorname, bookname))
            # only results strictly greater than True are counted
            # NOTE(review): presumably processResultList returns a larger value
            # when something was actually snatched - confirm against its definition
            if found > True:
                rss_count += 1

        logger.info("RSS Search for Wanted items complete, found %s book%s" % (rss_count, plural(rss_count)))

        if reset:
            scheduleJob(action='Restart', target='search_rss_book')

    except Exception:
        logger.error('Unhandled exception in search_rss_book: %s' % traceback.format_exc())
def search_rss_book(books=None, reset=False):
    """
    Search the results returned by the RSS providers for wanted books.

    books: None to look for every book whose Status is "Wanted" (backlog search),
           otherwise an iterable of dicts with a 'bookid' key; only those of them
           that are still "Wanted" are searched for
    reset: if True, restart the 'search_rss_book' scheduled job after the search
           has run to the end (the early returns below skip the restart)
    return: None. Any exception is caught and logged with its traceback.
    """
    # noinspection PyBroadException
    try:
        current = threading.current_thread()
        if "Thread-" in current.name:
            # give an anonymous worker thread a recognisable name
            current.name = "SEARCHRSS"

        if not lazylibrarian.USE_RSS():
            logger.warn('RSS search is disabled')
            scheduleJob(action='Stop', target='search_rss_book')
            return

        myDB = database.DBConnection()

        if books is None:
            # We are performing a backlog search
            searchbooks = myDB.select(
                'SELECT BookID, AuthorName, Bookname, BookSub, BookAdded from books '
                'WHERE Status="Wanted" order by BookAdded desc')
        else:
            # The user has added a new book
            searchbooks = []
            for book in books:
                # Bind the id as a parameter instead of formatting it into the SQL.
                # NOTE(review): assumes DBConnection.select accepts (query, args) the
                # same way DBConnection.match is called elsewhere in this file.
                searchbook = myDB.select(
                    'SELECT BookID, AuthorName, BookName, BookSub from books '
                    'WHERE BookID=? AND Status="Wanted"', (book['bookid'],))
                searchbooks.extend(searchbook)

        if not searchbooks:
            return

        logger.info('RSS Searching for %i book%s' % (len(searchbooks), plural(len(searchbooks))))

        resultlist, nproviders = IterateOverRSSSites()
        if not nproviders:
            logger.warn('No rss providers are set, check config')
            return  # No point in continuing

        rss_count = 0
        for book in searchbooks:
            authorname, bookname = get_searchterm(book, "book")
            found = processResultList(resultlist, authorname, bookname, book, 'book')

            # if you can't find the book, try title without any "(extended details, series etc)"
            if not found and '(' in bookname:  # anything to shorten?
                authorname, bookname = get_searchterm(book, "shortbook")
                found = processResultList(resultlist, authorname, bookname, book, 'shortbook')

            if not found:
                logger.debug("Searches returned no results. Adding book %s - %s to queue." %
                             (authorname, bookname))
            # only results strictly greater than True are counted
            # NOTE(review): presumably processResultList returns a larger value
            # when something was actually snatched - confirm against its definition
            if found > True:
                rss_count += 1

        logger.info("RSS Search for Wanted items complete, found %s book%s" % (rss_count, plural(rss_count)))

        if reset:
            scheduleJob(action='Restart', target='search_rss_book')

    except Exception:
        logger.error('Unhandled exception in search_rss_book: %s' % traceback.format_exc())