def get_book_desc(isbn=None, author=None, title=None): """ GoodReads does not always have a book description in its api results due to restrictive TOS from some of its providers. Try to get missing descriptions from googlebooks Return description, empty string if not found, None if error""" if not author or not title: return '' author = cleanName(author) title = cleanName(title) if lazylibrarian.CONFIG['BOOK_API'] == 'GoodReads': baseurl = 'https://www.googleapis.com/books/v1/volumes?q=' urls = [ baseurl + quote_plus('inauthor:%s intitle:%s' % (author, title)) ] if isbn: urls.insert(0, baseurl + quote_plus('isbn:' + isbn)) for url in urls: if lazylibrarian.CONFIG['GB_API']: url += '&key=' + lazylibrarian.CONFIG['GB_API'] if lazylibrarian.CONFIG['GB_COUNTRY'] and len( lazylibrarian.CONFIG['GB_COUNTRY'] == 2): url += '&country=' + lazylibrarian.CONFIG['GB_COUNTRY'] results, cached = gb_json_request(url) if results is None: # there was an error return None if results and not cached: time.sleep(1) if results and 'items' in results: for item in results['items']: # noinspection PyBroadException try: auth = item['volumeInfo']['authors'][0] book = item['volumeInfo']['title'] desc = item['volumeInfo']['description'] book_fuzz = fuzz.token_set_ratio(book, title) auth_fuzz = fuzz.token_set_ratio(auth, author) if book_fuzz > 98 and auth_fuzz > 80: return desc except Exception: pass return ''
def get_book_desc(isbn=None, author=None, title=None): """ GoodReads does not always have a book description in its api results due to restrictive TOS from some of its providers. Try to get missing descriptions from googlebooks Return description, empty string if not found, None if error""" if not author or not title: return '' author = cleanName(author) title = cleanName(title) if lazylibrarian.CONFIG['BOOK_API'] == 'GoodReads': baseurl = 'https://www.googleapis.com/books/v1/volumes?q=' urls = [baseurl + quote_plus('inauthor:%s intitle:%s' % (author, title))] if isbn: urls.insert(0, baseurl + quote_plus('isbn:' + isbn)) for url in urls: if lazylibrarian.CONFIG['GB_API']: url += '&key=' + lazylibrarian.CONFIG['GB_API'] if lazylibrarian.CONFIG['GB_COUNTRY'] and len(lazylibrarian.CONFIG['GB_COUNTRY'] == 2): url += '&country=' + lazylibrarian.CONFIG['GB_COUNTRY'] results, cached = gb_json_request(url) if results is None: # there was an error return None if results and not cached: time.sleep(1) if results and 'items' in results: for item in results['items']: # noinspection PyBroadException try: auth = item['volumeInfo']['authors'][0] book = item['volumeInfo']['title'] desc = item['volumeInfo']['description'] book_fuzz = fuzz.token_set_ratio(book, title) auth_fuzz = fuzz.token_set_ratio(auth, author) if book_fuzz > 98 and auth_fuzz > 80: return desc except Exception: pass return ''
def processResultList(resultlist, book, searchtype): myDB = database.DBConnection() dictrepl = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '', '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '', '!': '', '-': '', '\s\s': ' ', ' the ': ' ', ' a ': ' ', ' and ': ' ', ' to ': ' ', ' of ': ' ', ' for ': ' ', ' my ': ' ', ' in ': ' ', ' at ': ' ', ' with ': ' '} for tor in resultlist: tor_Title = formatter.latinToAscii(formatter.replace_all(str(tor['tor_title']), dictrepl)).strip() tor_Title = re.sub(r"\s\s+", " ", tor_Title) # remove extra whitespace match_ratio = int(lazylibrarian.MATCH_RATIO) tor_Title_match = fuzz.token_set_ratio(book['searchterm'], tor_Title) logger.debug("Torrent Title Match %: " + str(tor_Title_match) + " for " + tor_Title) if (tor_Title_match > match_ratio): logger.debug(u'Found Torrent: %s using %s search' % (tor['tor_title'], searchtype)) bookid = book['bookid'] tor_Title = (book["authorName"] + ' - ' + book['bookName'] + ' LL.(' + book['bookid'] + ')').strip() tor_url = tor['tor_url'] tor_prov = tor['tor_prov'] tor_size_temp = tor['tor_size'] # Need to cater for when this is NONE (Issue 35) if tor_size_temp is None: tor_size_temp = 1000 tor_size = str(round(float(tor_size_temp) / 1048576, 2)) + ' MB' controlValueDict = {"NZBurl": tor_url} newValueDict = { "NZBprov": tor_prov, "BookID": bookid, "NZBsize": tor_size, "NZBtitle": tor_Title, "NZBmode": "torrent", "Status": "Skipped" } myDB.upsert("wanted", newValueDict, controlValueDict) snatchedbooks = myDB.action('SELECT * from books WHERE BookID="%s" and Status="Snatched"' % bookid).fetchone() if not snatchedbooks: snatch = TORDownloadMethod(bookid, tor_prov, tor_Title, tor_url) if snatch: notifiers.notify_snatch(formatter.latinToAscii(tor_Title) + ' at ' + formatter.now()) postprocess.schedule_processor(action='Start') return True logger.debug("No torrent's found for " + (book["authorName"] + ' ' + book['bookName']).strip() + " using searchtype " + searchtype) return False
def processResultList(resultlist, book, searchtype): myDB = database.DBConnection() dictrepl = { "...": "", ".": " ", " & ": " ", " = ": " ", "?": "", "$": "s", " + ": " ", '"': "", ",": " ", "*": "", "(": "", ")": "", "[": "", "]": "", "#": "", "0": "", "1": "", "2": "", "3": "", "4": "", "5": "", "6": "", "7": "", "8": "", "9": "", "'": "", ":": "", "!": "", "-": " ", "\s\s": " ", } # ' the ': ' ', ' a ': ' ', ' and ': ' ', # ' to ': ' ', ' of ': ' ', ' for ': ' ', ' my ': ' ', ' in ': ' ', ' at ': ' ', ' with ': ' '} dic = { "...": "", ".": " ", " & ": " ", " = ": " ", "?": "", "$": "s", " + ": " ", '"': "", ",": "", "*": "", ":": "", ";": "", "'": "", } match_ratio = int(lazylibrarian.MATCH_RATIO) reject_list = formatter.getList(lazylibrarian.REJECT_WORDS) author = formatter.latinToAscii(formatter.replace_all(book["authorName"], dic)) title = formatter.latinToAscii(formatter.replace_all(book["bookName"], dic)) matches = [] for nzb in resultlist: nzb_Title = formatter.latinToAscii(formatter.replace_all(nzb["nzbtitle"], dictrepl)).strip() nzb_Title = re.sub(r"\s\s+", " ", nzb_Title) # remove extra whitespace nzbAuthor_match = fuzz.token_set_ratio(author, nzb_Title) nzbBook_match = fuzz.token_set_ratio(title, nzb_Title) logger.debug(u"NZB author/book Match: %s/%s for %s" % (nzbAuthor_match, nzbBook_match, nzb_Title)) rejected = False for word in reject_list: if word in nzb_Title.lower() and not word in author.lower() and not word in title.lower(): rejected = True logger.debug("Rejecting %s, contains %s" % (nzb_Title, word)) break nzbsize_temp = nzb["nzbsize"] # Need to cater for when this is NONE (Issue 35) if nzbsize_temp is None: nzbsize_temp = 1000 nzbsize = round(float(nzbsize_temp) / 1048576, 2) maxsize = formatter.check_int(lazylibrarian.REJECT_MAXSIZE, 0) if maxsize and nzbsize > maxsize: rejected = True logger.debug("Rejecting %s, too large" % nzb_Title) if nzbAuthor_match >= match_ratio and nzbBook_match >= match_ratio and not rejected: # logger.debug(u'Found NZB: %s using %s search' % (nzb['nzbtitle'], searchtype)) bookid = book["bookid"] nzbTitle = (author + " - " + title + " LL.(" + book["bookid"] + ")").strip() nzburl = nzb["nzburl"] nzbprov = nzb["nzbprov"] nzbdate_temp = nzb["nzbdate"] nzbdate = formatter.nzbdate2format(nzbdate_temp) nzbmode = nzb["nzbmode"] controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": formatter.now(), # when we asked for it "NZBsize": nzbsize, "NZBtitle": nzbTitle, "NZBmode": nzbmode, "Status": "Skipped", } score = (nzbBook_match + nzbAuthor_match) / 2 # as a percentage # lose a point for each extra word in the title so we get the closest match words = len(formatter.getList(nzb_Title)) words -= len(formatter.getList(author)) words -= len(formatter.getList(title)) score -= abs(words) matches.append([score, nzb_Title, newValueDict, controlValueDict]) if matches: highest = max(matches, key=lambda x: x[0]) score = highest[0] nzb_Title = highest[1] newValueDict = highest[2] controlValueDict = highest[3] logger.info(u"Best match NZB (%s%%): %s using %s search" % (score, nzb_Title, searchtype)) myDB.upsert("wanted", newValueDict, controlValueDict) snatchedbooks = myDB.action( 'SELECT * from books WHERE BookID="%s" and Status="Snatched"' % newValueDict["BookID"] ).fetchone() if not snatchedbooks: if nzbmode == "torznab": snatch = TORDownloadMethod( newValueDict["BookID"], newValueDict["NZBprov"], newValueDict["NZBtitle"], controlValueDict["NZBurl"], ) else: snatch = NZBDownloadMethod( newValueDict["BookID"], newValueDict["NZBprov"], newValueDict["NZBtitle"], controlValueDict["NZBurl"], ) if snatch: notifiers.notify_snatch(newValueDict["NZBtitle"] + " at " + formatter.now()) common.schedule_job(action="Start", target="processDir") return True logger.debug( "No nzb's found for " + (book["authorName"] + " " + book["bookName"]).strip() + " using searchtype " + searchtype ) return False
def find_results(self, searchterm=None, queue=None): try: resultlist = [] api_hits = 0 searchtitle = '' searchauthorname = '' if ' <ll> ' in searchterm: # special token separates title from author searchtitle, searchauthorname = searchterm.split(' <ll> ') searchterm = searchterm.replace(' <ll> ', ' ') searchterm = searchterm.encode(lazylibrarian.SYS_ENCODING) url = urllib.quote_plus(searchterm) set_url = 'https://www.goodreads.com/search.xml?q=' + url + '&' + urllib.urlencode(self.params) logger.debug('Now searching GoodReads API with searchterm: %s' % searchterm) # logger.debug('Searching for %s at: %s' % (searchterm, set_url)) resultcount = 0 try: try: rootxml, in_cache = get_xml_request(set_url) except Exception as e: logger.error("%s finding gr results: %s" % (type(e).__name__, str(e))) return if rootxml is None: logger.debug("Error requesting results") return totalresults = check_int(rootxml.find('search/total-results').text, 0) resultxml = rootxml.getiterator('work') loopCount = 1 while resultxml: for author in resultxml: try: if author.find('original_publication_year').text is None: bookdate = "0000" else: bookdate = author.find('original_publication_year').text except (KeyError, AttributeError): bookdate = "0000" try: authorNameResult = author.find('./best_book/author/name').text # Goodreads sometimes puts extra whitepase in the author names! authorNameResult = ' '.join(authorNameResult.split()) except (KeyError, AttributeError): authorNameResult = "" booksub = "" bookpub = "" booklang = "Unknown" try: bookimg = author.find('./best_book/image_url').text if bookimg == 'https://www.goodreads.com/assets/nocover/111x148.png': bookimg = 'images/nocover.png' except (KeyError, AttributeError): bookimg = 'images/nocover.png' try: bookrate = author.find('average_rating').text except KeyError: bookrate = 0 bookpages = '0' bookgenre = '' bookdesc = '' bookisbn = '' try: booklink = 'https://www.goodreads.com/book/show/' + author.find('./best_book/id').text except (KeyError, AttributeError): booklink = "" try: authorid = author.find('./best_book/author/id').text except (KeyError, AttributeError): authorid = "" try: if author.find('./best_book/title').text is None: bookTitle = "" else: bookTitle = author.find('./best_book/title').text except (KeyError, AttributeError): bookTitle = "" if searchauthorname: author_fuzz = fuzz.ratio(authorNameResult, searchauthorname) else: author_fuzz = fuzz.ratio(authorNameResult, searchterm) if searchtitle: book_fuzz = fuzz.token_set_ratio(bookTitle, searchtitle) # lose a point for each extra word in the fuzzy matches so we get the closest match words = len(getList(bookTitle)) words -= len(getList(searchtitle)) book_fuzz -= abs(words) else: book_fuzz = fuzz.token_set_ratio(bookTitle, searchterm) words = len(getList(bookTitle)) words -= len(getList(searchterm)) book_fuzz -= abs(words) isbn_fuzz = 0 if is_valid_isbn(searchterm): isbn_fuzz = 100 highest_fuzz = max((author_fuzz + book_fuzz) / 2, isbn_fuzz) try: bookid = author.find('./best_book/id').text except (KeyError, AttributeError): bookid = "" resultlist.append({ 'authorname': authorNameResult, 'bookid': bookid, 'authorid': authorid, 'bookname': bookTitle, 'booksub': booksub, 'bookisbn': bookisbn, 'bookpub': bookpub, 'bookdate': bookdate, 'booklang': booklang, 'booklink': booklink, 'bookrate': float(bookrate), 'bookimg': bookimg, 'bookpages': bookpages, 'bookgenre': bookgenre, 'bookdesc': bookdesc, 'author_fuzz': author_fuzz, 'book_fuzz': book_fuzz, 'isbn_fuzz': isbn_fuzz, 'highest_fuzz': highest_fuzz, 'num_reviews': float(bookrate) }) resultcount += 1 loopCount += 1 if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < loopCount: resultxml = None logger.warn('Maximum results page search reached, still more results available') elif totalresults and resultcount >= totalresults: # fix for goodreads bug on isbn searches resultxml = None else: URL = set_url + '&page=' + str(loopCount) resultxml = None try: rootxml, in_cache = get_xml_request(URL) if rootxml is None: logger.debug('Error requesting page %s of results' % loopCount) else: resultxml = rootxml.getiterator('work') if not in_cache: api_hits += 1 except Exception as e: resultxml = None logger.error("%s finding page %s of results: %s" % (type(e).__name__, loopCount, str(e))) if resultxml: if all(False for _ in resultxml): # returns True if iterator is empty resultxml = None except Exception as err: if err.code == 404: logger.error('Received a 404 error when searching for author') if err.code == 403: logger.warn('Access to api is denied: usage exceeded') else: logger.error('An unexpected error has occurred when searching for an author: %s' % str(err)) logger.debug('Found %s result%s with keyword: %s' % (resultcount, plural(resultcount), searchterm)) logger.debug( 'The GoodReads API was hit %s time%s for keyword %s' % (api_hits, plural(api_hits), searchterm)) queue.put(resultlist) except Exception: logger.error('Unhandled exception in GR.find_results: %s' % traceback.format_exc())
def processResultList(resultlist, book, searchtype): myDB = database.DBConnection() dictrepl = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': ' ', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '', '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '', '!': '', '-': ' ', '\s\s': ' '} dic = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': ''} match_ratio = int(lazylibrarian.MATCH_RATIO) reject_list = getList(lazylibrarian.REJECT_WORDS) author = unaccented_str(replace_all(book['authorName'], dic)) title = unaccented_str(replace_all(book['bookName'], dic)) matches = [] for tor in resultlist: torTitle = unaccented_str(tor['tor_title']) torTitle = replace_all(torTitle, dictrepl).strip() torTitle = re.sub(r"\s\s+", " ", torTitle) # remove extra whitespace torAuthor_match = fuzz.token_set_ratio(author, torTitle) torBook_match = fuzz.token_set_ratio(title, torTitle) logger.debug(u"TOR author/book Match: %s/%s for %s" % (torAuthor_match, torBook_match, torTitle)) tor_url = tor['tor_url'] rejected = False already_failed = myDB.match('SELECT * from wanted WHERE NZBurl="%s" and Status="Failed"' % tor_url) if already_failed: logger.debug("Rejecting %s, blacklisted at %s" % (torTitle, already_failed['NZBprov'])) rejected = True if not rejected: for word in reject_list: if word in torTitle.lower() and word not in author.lower() and word not in title.lower(): rejected = True logger.debug("Rejecting %s, contains %s" % (torTitle, word)) break tor_size_temp = tor['tor_size'] # Need to cater for when this is NONE (Issue 35) tor_size_temp = check_int(tor_size_temp, 1000) tor_size = round(float(tor_size_temp) / 1048576, 2) maxsize = check_int(lazylibrarian.REJECT_MAXSIZE, 0) if not rejected: if maxsize and tor_size > maxsize: rejected = True logger.debug("Rejecting %s, too large" % torTitle) if not rejected: bookid = book['bookid'] tor_Title = (author + ' - ' + title + ' LL.(' + book['bookid'] + ')').strip() controlValueDict = {"NZBurl": tor_url} newValueDict = { "NZBprov": tor['tor_prov'], "BookID": bookid, "NZBdate": now(), # when we asked for it "NZBsize": tor_size, "NZBtitle": tor_Title, "NZBmode": "torrent", "Status": "Skipped" } score = (torBook_match + torAuthor_match) / 2 # as a percentage # lose a point for each extra word in the title so we get the closest match words = len(getList(torTitle)) words -= len(getList(author)) words -= len(getList(title)) score -= abs(words) matches.append([score, torTitle, newValueDict, controlValueDict]) if matches: highest = max(matches, key=lambda x: x[0]) score = highest[0] nzb_Title = highest[1] newValueDict = highest[2] controlValueDict = highest[3] if score < match_ratio: logger.info(u'Nearest TOR match (%s%%): %s using %s search for %s %s' % (score, nzb_Title, searchtype, author, title)) return False logger.info(u'Best TOR match (%s%%): %s using %s search' % (score, nzb_Title, searchtype)) snatchedbooks = myDB.match('SELECT * from books WHERE BookID="%s" and Status="Snatched"' % newValueDict["BookID"]) if snatchedbooks: logger.debug('%s already marked snatched' % nzb_Title) return True # someone else found it, not us else: myDB.upsert("wanted", newValueDict, controlValueDict) if newValueDict["NZBprov"] == 'libgen': # for libgen we use direct download links snatch = DirectDownloadMethod(newValueDict["BookID"], newValueDict["NZBprov"], newValueDict["NZBtitle"], controlValueDict["NZBurl"], nzb_Title) else: snatch = TORDownloadMethod(newValueDict["BookID"], newValueDict["NZBprov"], newValueDict["NZBtitle"], controlValueDict["NZBurl"]) if snatch: logger.info('Downloading %s from %s' % (newValueDict["NZBtitle"], newValueDict["NZBprov"])) notify_snatch("%s from %s at %s" % (newValueDict["NZBtitle"], newValueDict["NZBprov"], now())) scheduleJob(action='Start', target='processDir') return True + True # we found it else: logger.debug("No torrent's found for [%s] using searchtype %s" % (book["searchterm"], searchtype)) return False
def find_results(self, authorname=None, queue=None): myDB = database.DBConnection() resultlist = [] # See if we should check ISBN field, otherwise ignore it api_strings = ['inauthor:', 'intitle:'] if is_valid_isbn(authorname): api_strings = ['isbn:'] api_hits = 0 logger.debug( 'Now searching Google Books API with keyword: ' + self.name) for api_value in api_strings: startindex = 0 if api_value == "isbn:": set_url = self.url + urllib.quote(api_value + self.name.encode(lazylibrarian.SYS_ENCODING)) else: set_url = self.url + \ urllib.quote(api_value + '"' + self.name.encode(lazylibrarian.SYS_ENCODING) + '"') try: startindex = 0 resultcount = 0 ignored = 0 number_results = 1 total_count = 0 no_author_count = 0 while startindex < number_results: self.params['startIndex'] = startindex URL = set_url + '&' + urllib.urlencode(self.params) try: jsonresults, in_cache = get_json_request(URL) if jsonresults is None: number_results = 0 else: if not in_cache: api_hits = api_hits + 1 number_results = jsonresults['totalItems'] logger.debug('Searching url: ' + URL) if number_results == 0: logger.warn( 'Found no results for %s with value: %s' % (api_value, self.name)) break else: pass except HTTPError as err: logger.warn( 'Google Books API Error [%s]: Check your API key or wait a while' % err.reason) break startindex = startindex + 40 for item in jsonresults['items']: total_count = total_count + 1 # skip if no author, no author is no book. try: Author = item['volumeInfo']['authors'][0] except KeyError: logger.debug( 'Skipped a result without authorfield.') no_author_count = no_author_count + 1 continue valid_langs = ([valid_lang.strip() for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')]) if "All" not in valid_langs: # don't care about languages, accept all try: # skip if language is not in valid list - booklang = item['volumeInfo']['language'] if booklang not in valid_langs: logger.debug( 'Skipped a book with language %s' % booklang) ignored = ignored + 1 continue except KeyError: ignored = ignored + 1 logger.debug( 'Skipped a result where no language is found') continue try: bookpub = item['volumeInfo']['publisher'] except KeyError: bookpub = None try: booksub = item['volumeInfo']['subtitle'] except KeyError: booksub = None try: bookdate = item['volumeInfo']['publishedDate'] except KeyError: bookdate = '0000-00-00' bookdate = bookdate[:4] try: bookimg = item['volumeInfo']['imageLinks']['thumbnail'] except KeyError: bookimg = 'images/nocover.png' try: bookrate = item['volumeInfo']['averageRating'] except KeyError: bookrate = 0 try: bookpages = item['volumeInfo']['pageCount'] except KeyError: bookpages = '0' try: bookgenre = item['volumeInfo']['categories'][0] except KeyError: bookgenre = None try: bookdesc = item['volumeInfo']['description'] except KeyError: bookdesc = 'Not available' try: num_reviews = item['volumeInfo']['ratingsCount'] except KeyError: num_reviews = 0 try: if item['volumeInfo']['industryIdentifiers'][0]['type'] == 'ISBN_10': bookisbn = item['volumeInfo'][ 'industryIdentifiers'][0]['identifier'] else: bookisbn = 0 except KeyError: bookisbn = 0 author_fuzz = fuzz.token_set_ratio(Author, authorname) book_fuzz = fuzz.token_set_ratio( item['volumeInfo']['title'], authorname) isbn_fuzz = 0 if is_valid_isbn(authorname): isbn_fuzz = 100 highest_fuzz = max(author_fuzz, book_fuzz, isbn_fuzz) bookname = item['volumeInfo']['title'] dic = {':': '', '"': '', '\'': ''} bookname = replace_all(bookname, dic) bookname = unaccented(bookname) bookname = bookname.strip() # strip whitespace bookid = item['id'] author = myDB.select( 'SELECT AuthorID FROM authors WHERE AuthorName = "%s"' % Author.replace('"', '""')) if author: AuthorID = author[0]['authorid'] else: AuthorID = '' resultlist.append({ 'authorname': Author, 'authorid': AuthorID, 'bookid': bookid, 'bookname': bookname, 'booksub': booksub, 'bookisbn': bookisbn, 'bookpub': bookpub, 'bookdate': bookdate, 'booklang': booklang, 'booklink': item['volumeInfo']['canonicalVolumeLink'], 'bookrate': float(bookrate), 'bookimg': bookimg, 'bookpages': bookpages, 'bookgenre': bookgenre, 'bookdesc': bookdesc, 'author_fuzz': author_fuzz, 'book_fuzz': book_fuzz, 'isbn_fuzz': isbn_fuzz, 'highest_fuzz': highest_fuzz, 'num_reviews': num_reviews }) resultcount = resultcount + 1 except KeyError: break logger.debug("Found %s total result%s" % (total_count, plural(total_count))) logger.debug("Removed %s bad language result%s" % (ignored, plural(ignored))) logger.debug("Removed %s book%s with no author" % (no_author_count, plural(no_author_count))) logger.debug( "Showing %s result%s for (%s) with keyword: %s" % (resultcount, plural(resultcount), api_value, authorname)) logger.debug( 'The Google Books API was hit %s time%s for keyword %s' % (api_hits, plural(api_hits), self.name)) queue.put(resultlist)
def search_magazines(mags=None, reset=False): # produce a list of magazines to search for, tor, nzb, torznab myDB = database.DBConnection() searchlist = [] threading.currentThread().name = "SEARCHMAGS" if mags is None: # backlog search searchmags = myDB.select('SELECT Title, LastAcquired, \ IssueDate from magazines WHERE Status="Active"') else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select('SELECT Title, LastAcquired, IssueDate from magazines \ WHERE Title="%s" AND Status="Active"' % (magazine['bookid'])) for terms in searchmags_temp: searchmags.append(terms) if len(searchmags) == 1: logger.info('Searching for one magazine') else: logger.info('Searching for %i magazines' % len(searchmags)) for searchmag in searchmags: bookid = searchmag[0] searchterm = searchmag[0] # frequency = searchmag[1] # last_acquired = searchmag[2] # issue_date = searchmag[3] dic = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': ''} searchterm = formatter.latinToAscii(formatter.replace_all(searchterm, dic)) searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8') searchlist.append({"bookid": bookid, "searchterm": searchterm}) if searchlist == []: logger.warn('There is nothing to search for. Mark some magazines as active.') for book in searchlist: resultlist = [] tor_resultlist = [] if lazylibrarian.USE_NZB(): resultlist, nproviders = providers.IterateOverNewzNabSites(book, 'mag') if not nproviders: logger.warn('No nzb providers are set. Check config for NEWZNAB or TORZNAB providers') if lazylibrarian.USE_TOR(): tor_resultlist, nproviders = providers.IterateOverTorrentSites(book, 'mag') if not nproviders: logger.warn('No torrent providers are set. Check config for TORRENT providers') for item in tor_resultlist: # reformat the torrent results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned from torrents 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if not resultlist: logger.debug("Adding magazine %s to queue." % book['searchterm']) else: bad_regex = 0 bad_date = 0 old_date = 0 total_nzbs = 0 new_date = 0 to_snatch = 0 maglist = [] issues = [] reject_list = formatter.getList(lazylibrarian.REJECT_WORDS) for nzb in resultlist: total_nzbs = total_nzbs + 1 bookid = nzb['bookid'] nzbtitle = (u'%s' % nzb['nzbtitle']) nzbtitle = nzbtitle.replace('"', '').replace("'", "") # suppress " in titles nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] if nzbsize_temp is None: # not all torrents returned by torznab have a size nzbsize_temp = 1000 nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + ' MB' nzbdate = formatter.nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] checkifmag = myDB.select('SELECT * from magazines WHERE Title="%s"' % bookid) if checkifmag: for results in checkifmag: control_date = results['IssueDate'] # frequency = results['Frequency'] # regex = results['Regex'] nzbtitle_formatted = nzbtitle.replace('.', ' ').replace('-', ' ').replace('/', ' ').replace( '+', ' ').replace('_', ' ').replace('(', '').replace(')', '').strip() # Need to make sure that substrings of magazine titles don't get found # (e.g. Maxim USA will find Maximum PC USA) - token_set_ratio takes care of this # keyword_check = nzbtitle_formatted.replace(bookid, '') # remove extra spaces if they're in a row nzbtitle_exploded_temp = " ".join(nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_exploded_temp.split(' ') if ' ' in bookid: bookid_exploded = bookid.split(' ') else: bookid_exploded = [bookid] # check nzb starts with magazine title, and ends with a date # eg The MagPI Issue 22 - July 2015 # do something like check left n words match title # then check last n words are a date name_match = 1 # assume name matches for now if len(nzbtitle_exploded) > len(bookid_exploded): # needs to be longer as it has to include a date # check (nearly) all the words in the mag title are in the nzbtitle - allow some fuzz mag_title_match = fuzz.token_set_ratio( common.remove_accents(bookid), common.remove_accents(nzbtitle_formatted)) if mag_title_match < lazylibrarian.MATCH_RATIO: logger.debug( u"Magazine token set Match failed: " + str( mag_title_match) + "% for " + nzbtitle_formatted) name_match = 0 lower_title = common.remove_accents(nzbtitle_formatted).lower() lower_bookid = common.remove_accents(bookid).lower() for word in reject_list: if word in lower_title and not word in lower_bookid: name_match = 0 logger.debug("Rejecting %s, contains %s" % (nzbtitle_formatted, word)) break if name_match: # some magazine torrent uploaders add their sig in [] or {} # Fortunately for us, they always seem to add it at the end # also some magazine torrent titles are "magazine_name some_form_of_date pdf" # so strip all the trailing junk... while nzbtitle_exploded[len(nzbtitle_exploded) - 1][0] in '[{' or \ nzbtitle_exploded[len(nzbtitle_exploded) - 1].lower() == 'pdf': nzbtitle_exploded.pop() # gotta love the function names # need at least one word magazine title and two date components if len(nzbtitle_exploded) > 2: # regexA = DD MonthName YYYY OR MonthName YYYY or Issue nn, MonthName YYYY regexA_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexA_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 2] regexA_month = formatter.month2num(common.remove_accents(regexA_month_temp)) if not regexA_year.isdigit() or int(regexA_year) < 1900 or int(regexA_year) > 2100: regexA_year = 'fail' # force date failure # if frequency == "Weekly" or frequency == "BiWeekly": regexA_day = nzbtitle_exploded[len(nzbtitle_exploded) - 3].rstrip(',').zfill(2) if regexA_day.isdigit(): if int(regexA_day) > 31: # probably issue number nn regexA_day = '01' else: regexA_day = '01' # just MonthName YYYY # else: # regexA_day = '01' # monthly, or less frequent try: newdatish = regexA_year + '-' + regexA_month + '-' + regexA_day # try to make sure the year/month/day are valid, exception if not # ie don't accept day > 31, or 30 in some months # also handles multiple date format named issues eg Jan 2014, 01 2014 # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexA_year), int(regexA_month), int(regexA_day)) except ValueError: # regexB = MonthName DD YYYY or MonthName DD, YYYY regexB_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexB_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 3] regexB_month = formatter.month2num(common.remove_accents(regexB_month_temp)) regexB_day = nzbtitle_exploded[len(nzbtitle_exploded) - 2].rstrip(',').zfill(2) if not regexB_year.isdigit() or int(regexB_year) < 1900 or int(regexB_year) > 2100: regexB_year = 'fail' try: newdatish = regexB_year + '-' + regexB_month + '-' + regexB_day # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexB_year), int(regexB_month), int(regexB_day)) except ValueError: # regexC = YYYY MM or YYYY MM DD # (can't get MM/DD if named YYYY Issue nn) # First try YYYY MM regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 2] if regexC_year.isdigit() and int(regexC_year) > 1900 and int(regexC_year) < 2100: regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2) regexC_day = '01' else: # try YYYY MM DD regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 3] if regexC_year.isdigit() and int(regexC_year) > 1900 and int(regexC_year) < 2100: regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 2].zfill(2) regexC_day = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2) else: regexC_year = 'fail' try: newdatish = regexC_year + '-' + regexC_month + '-' + regexC_day # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexC_year), int(regexC_month), int(regexC_day)) except: logger.debug('Magazine %s not in proper date format.' % nzbtitle_formatted) bad_date = bad_date + 1 # allow issues with good name but bad date to be included # so user can manually select them, incl those with issue numbers newdatish = "1970-01-01" # provide a fake date for bad-date issues # continue else: continue # store all the _new_ matching results, marking as "skipped" for now # we change the status to "wanted" on the ones we want to snatch later # don't add a new entry if this issue has been found on an earlier search # because status might have been user-set mag_entry = myDB.select('SELECT * from wanted WHERE NZBtitle="%s" and NZBprov="%s"' % (nzbtitle, nzbprov)) if not mag_entry: controlValueDict = { "NZBtitle": nzbtitle, "NZBprov": nzbprov } newValueDict = { "NZBurl": nzburl, "BookID": bookid, "NZBdate": nzbdate, "AuxInfo": newdatish, "Status": "Skipped", "NZBsize": nzbsize, "NZBmode": nzbmode } myDB.upsert("wanted", newValueDict, controlValueDict) if control_date is None: # we haven't got any copies of this magazine yet # get a rough time just over a month ago to compare to, in format yyyy-mm-dd # could perhaps calc differently for weekly, biweekly etc start_time = time.time() start_time -= 31 * 24 * 60 * 60 # number of seconds in 31 days control_date = time.strftime("%Y-%m-%d", time.localtime(start_time)) # only grab a copy if it's newer than the most recent we have, # or newer than a month ago if we have none comp_date = formatter.datecompare(newdatish, control_date) if comp_date > 0: # Should probably only upsert when downloaded and processed in case snatch fails # keep track of what we're going to download so we don't download dupes new_date = new_date + 1 issue = bookid + ',' + newdatish if issue not in issues: maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl, 'nzbmode': nzbmode }) logger.debug('This issue of %s is new, downloading' % nzbtitle_formatted) to_snatch = to_snatch + 1 issues.append(issue) controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBdate": formatter.now(), # when we asked for it "Status": "Wanted" } myDB.upsert("wanted", newValueDict, controlValueDict) else: logger.debug('This issue of %s is already flagged for download' % issue) else: if newdatish != "1970-01-01": # this is our fake date for ones we can't decipher logger.debug('This issue of %s is old; skipping.' % nzbtitle_formatted) old_date = old_date + 1 else: logger.debug('Magazine [%s] does not completely match search term [%s].' % ( nzbtitle_formatted, bookid)) bad_regex = bad_regex + 1 logger.info('Found %i results for %s. %i new, %i old, %i fail date, %i fail name: %i to download' % ( total_nzbs, bookid, new_date, old_date, bad_date, bad_regex, to_snatch)) for items in maglist: if items['nzbmode'] == "torznab": snatch = TORDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl']) elif items['nzbmode'] == "torrent": snatch = TORDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl']) else: snatch = NZBDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl']) if snatch: notifiers.notify_snatch(formatter.latinToAscii(items['nzbtitle']) + ' at ' + formatter.now()) common.schedule_job(action='Start', target='processDir') maglist = [] if reset: common.schedule_job(action='Restart', target='search_magazines') logger.info("Search for magazines complete")
def searchItem(item=None, bookid=None): """ Call all active search providers asking for a "general" search for item return a list of results, each entry in list containing percentage_match, title, provider, size, url """ results = [] if not item: return results if not internet(): logger.debug('Search Item: No internet connection') return results book = {} searchterm = unaccented_str(item) book['searchterm'] = searchterm if bookid: book['bookid'] = bookid else: book['bookid'] = searchterm nproviders = lazylibrarian.USE_NZB() + lazylibrarian.USE_TOR( ) + lazylibrarian.USE_RSS() logger.debug('Searching %s providers for %s' % (nproviders, searchterm)) if lazylibrarian.USE_NZB(): resultlist, nproviders = IterateOverNewzNabSites(book, 'general') if nproviders: results += resultlist if lazylibrarian.USE_TOR(): resultlist, nproviders = IterateOverTorrentSites(book, 'general') if nproviders: results += resultlist if lazylibrarian.USE_RSS(): resultlist, nproviders = IterateOverRSSSites() if nproviders: results += resultlist # reprocess to get consistent results searchresults = [] for item in results: provider = '' title = '' url = '' size = '' date = '' mode = '' if 'nzbtitle' in item: title = item['nzbtitle'] if 'nzburl' in item: url = item['nzburl'] if 'nzbprov' in item: provider = item['nzbprov'] if 'nzbsize' in item: size = item['nzbsize'] if 'nzbdate' in item: date = item['nzbdate'] if 'nzbmode' in item: mode = item['nzbmode'] if 'tor_title' in item: title = item['tor_title'] if 'tor_url' in item: url = item['tor_url'] if 'tor_prov' in item: provider = item['tor_prov'] if 'tor_size' in item: size = item['tor_size'] if 'tor_date' in item: date = item['tor_date'] if 'tor_type' in item: mode = item['tor_type'] if title and provider and mode and url: # Not all results have a date or a size if not date: date = 'Fri, 01 Jan 1970 00:00:00 +0100' if not size: size = '1000' # calculate match percentage score = fuzz.token_set_ratio(searchterm, title) # lose a point for each extra word in the title so we get the closest match words = len(getList(searchterm)) words -= len(getList(title)) score -= abs(words) if score >= 40: # ignore wildly wrong results? url = url.split('?')[0] result = { 'score': score, 'title': title, 'provider': provider, 'size': size, 'date': date, 'url': urllib.quote_plus(url), 'mode': mode } searchresults.append(result) # from operator import itemgetter # searchresults = sorted(searchresults, key=itemgetter('score'), reverse=True) logger.debug('Found %s results for %s' % (len(searchresults), searchterm)) return searchresults
def findBestResult(resultlist, book, searchtype, source): """ resultlist: collated results from search providers book: the book we want to find searchtype: book, magazine, shortbook, audiobook etc. source: nzb, tor, rss, direct return: highest scoring match, or None if no match """ # noinspection PyBroadException try: myDB = database.DBConnection() dictrepl = { '...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': ' ', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '', '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '', '!': '', '-': ' ', '\s\s': ' ' } dic = { '...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '.', ';': '', '\'': '' } if source == 'rss': author, title = get_searchterm(book, searchtype) else: author = unaccented_str(replace_all(book['authorName'], dic)) title = unaccented_str(replace_all(book['bookName'], dic)) if book['library'] == 'AudioBook': reject_list = getList(lazylibrarian.CONFIG['REJECT_AUDIO']) maxsize = check_int(lazylibrarian.CONFIG['REJECT_MAXAUDIO'], 0) minsize = check_int(lazylibrarian.CONFIG['REJECT_MINAUDIO'], 0) auxinfo = 'AudioBook' else: # elif book['library'] == 'eBook': reject_list = getList(lazylibrarian.CONFIG['REJECT_WORDS']) maxsize = check_int(lazylibrarian.CONFIG['REJECT_MAXSIZE'], 0) minsize = check_int(lazylibrarian.CONFIG['REJECT_MINSIZE'], 0) auxinfo = 'eBook' if source == 'nzb': prefix = 'nzb' else: # rss and libgen return same names as torrents prefix = 'tor_' logger.debug('Searching %s %s results for best %s match' % (len(resultlist), source, auxinfo)) matches = [] for res in resultlist: resultTitle = unaccented_str( replace_all(res[prefix + 'title'], dictrepl)).strip() resultTitle = re.sub(r"\s\s+", " ", resultTitle) # remove extra whitespace Author_match = fuzz.token_set_ratio(author, resultTitle) Book_match = fuzz.token_set_ratio(title, resultTitle) if lazylibrarian.LOGLEVEL & lazylibrarian.log_fuzz: logger.debug("%s author/book Match: %s/%s %s at %s" % (source.upper(), Author_match, Book_match, resultTitle, res[prefix + 'prov'])) rejected = False url = res[prefix + 'url'] if url is None: rejected = True logger.debug("Rejecting %s, no URL found" % resultTitle) if not rejected and lazylibrarian.CONFIG['BLACKLIST_FAILED']: already_failed = myDB.match( 'SELECT * from wanted WHERE NZBurl=? and Status="Failed"', (url, )) if already_failed: logger.debug("Rejecting %s, blacklisted at %s" % (resultTitle, already_failed['NZBprov'])) rejected = True if not rejected and lazylibrarian.CONFIG['BLACKLIST_PROCESSED']: already_failed = myDB.match( 'SELECT * from wanted WHERE NZBurl=?', (url, )) if already_failed: logger.debug("Rejecting %s, blacklisted at %s" % (resultTitle, already_failed['NZBprov'])) rejected = True if not rejected and not url.startswith( 'http') and not url.startswith('magnet'): rejected = True logger.debug("Rejecting %s, invalid URL [%s]" % (resultTitle, url)) if not rejected: for word in reject_list: if word in getList(resultTitle.lower()) and word not in getList(author.lower()) \ and word not in getList(title.lower()): rejected = True logger.debug("Rejecting %s, contains %s" % (resultTitle, word)) break size_temp = check_int( res[prefix + 'size'], 1000) # Need to cater for when this is NONE (Issue 35) size = round(float(size_temp) / 1048576, 2) if not rejected and maxsize and size > maxsize: rejected = True logger.debug("Rejecting %s, too large" % resultTitle) if not rejected and minsize and size < minsize: rejected = True logger.debug("Rejecting %s, too small" % resultTitle) if not rejected: bookid = book['bookid'] # newTitle = (author + ' - ' + title + ' LL.(' + book['bookid'] + ')').strip() # newTitle = resultTitle + ' LL.(' + book['bookid'] + ')' if source == 'nzb': mode = res['nzbmode'] # nzb, torznab else: mode = res[ 'tor_type'] # torrent, magnet, nzb(from rss), direct controlValueDict = {"NZBurl": url} newValueDict = { "NZBprov": res[prefix + 'prov'], "BookID": bookid, "NZBdate": now(), # when we asked for it "NZBsize": size, "NZBtitle": resultTitle, "NZBmode": mode, "AuxInfo": auxinfo, "Status": "Skipped" } score = (Book_match + Author_match) / 2 # as a percentage # lose a point for each unwanted word in the title so we get the closest match # but for RSS ignore anything at the end in square braces [keywords, genres etc] if source == 'rss': wordlist = getList(resultTitle.rsplit('[', 1)[0].lower()) else: wordlist = getList(resultTitle.lower()) words = [ x for x in wordlist if x not in getList(author.lower()) ] words = [x for x in words if x not in getList(title.lower())] typelist = '' if newValueDict['AuxInfo'] == 'eBook': words = [ x for x in words if x not in getList(lazylibrarian.CONFIG['EBOOK_TYPE']) ] typelist = getList(lazylibrarian.CONFIG['EBOOK_TYPE']) elif newValueDict['AuxInfo'] == 'AudioBook': words = [ x for x in words if x not in getList( lazylibrarian.CONFIG['AUDIOBOOK_TYPE']) ] typelist = getList(lazylibrarian.CONFIG['AUDIOBOOK_TYPE']) score -= len(words) # prioritise titles that include the ebook types we want # add more points for booktypes nearer the left in the list # eg if epub, mobi, pdf add 3 points if epub found, 2 for mobi, 1 for pdf booktypes = [x for x in wordlist if x in typelist] if booktypes: typelist = list(reversed(typelist)) for item in booktypes: for i in [ i for i, x in enumerate(typelist) if x == item ]: score += i + 1 matches.append( [score, newValueDict, controlValueDict, res['priority']]) if matches: highest = max(matches, key=lambda s: (s[0], s[3])) score = highest[0] newValueDict = highest[1] # controlValueDict = highest[2] dlpriority = highest[3] if score < int(lazylibrarian.CONFIG['MATCH_RATIO']): logger.info( 'Nearest match (%s%%): %s using %s search for %s %s' % (score, newValueDict['NZBtitle'], searchtype, book['authorName'], book['bookName'])) else: logger.info( 'Best match (%s%%): %s using %s search, %s priority %s' % (score, newValueDict['NZBtitle'], searchtype, newValueDict['NZBprov'], dlpriority)) return highest else: logger.debug("No %s found for [%s] using searchtype %s" % (source, book["searchterm"], searchtype)) return None except Exception: logger.error('Unhandled exception in findBestResult: %s' % traceback.format_exc())
def processDir(reset=False): threadname = threading.currentThread().name if "Thread-" in threadname: threading.currentThread().name = "POSTPROCESS" if not lazylibrarian.DOWNLOAD_DIR or not os.path.isdir( lazylibrarian.DOWNLOAD_DIR): processpath = os.getcwd() else: processpath = lazylibrarian.DOWNLOAD_DIR logger.debug(' Checking [%s] for files to post process' % processpath) try: downloads = os.listdir(processpath) except OSError as why: logger.error('Could not access [%s] directory [%s]' % (processpath, why.strerror)) return myDB = database.DBConnection() snatched = myDB.select('SELECT * from wanted WHERE Status="Snatched"') if len(snatched) == 0: logger.info('Nothing marked as snatched.') scheduleJob(action='Stop', target='processDir') return if len(downloads) == 0: logger.info('No downloads are found. Nothing to process.') return logger.info("Checking %s download%s for %s snatched file%s" % (len(downloads), plural( len(downloads)), len(snatched), plural(len(snatched)))) ppcount = 0 for book in snatched: matches = [] for fname in downloads: if not fname.endswith('.fail'): # has this failed before? # this is to get round differences in torrent filenames. # Torrents aren't always returned with the name we searched for # there might be a better way... if isinstance(fname, str): matchname = fname.decode(lazylibrarian.SYS_ENCODING) else: matchname = fname if ' LL.(' in matchname: matchname = matchname.split(' LL.(')[0] matchtitle = book['NZBtitle'] match = 0 if matchtitle: if ' LL.(' in matchtitle: matchtitle = matchtitle.split(' LL.(')[0] match = fuzz.token_set_ratio(matchtitle, matchname) if match >= lazylibrarian.DLOAD_RATIO: fname = matchname if os.path.isfile(os.path.join(processpath, fname)): # not a directory, handle single file downloads here. Book/mag file in download root. # move the file into it's own subdirectory so we don't move/delete things that aren't ours if is_valid_booktype(fname, booktype="book") \ or is_valid_booktype(fname, booktype="mag"): fname = os.path.splitext(fname)[0] dirname = os.path.join(processpath, fname) if not os.path.exists(dirname): try: os.makedirs(dirname) except OSError as why: logger.debug( 'Failed to create directory %s, %s' % (dirname, why.strerror)) if os.path.exists(dirname): # move the book and any related files too # ie other book formats, or opf, jpg with same title # can't move metadata.opf or cover.jpg or similar # as can't be sure they are ours # not sure if we need a new listdir here, or whether we can use the old one list_dir = os.listdir(processpath) for ourfile in list_dir: if ourfile.startswith(fname): if is_valid_booktype(ourfile, booktype="book") \ or is_valid_booktype(ourfile, booktype="mag") \ or os.path.splitext(ourfile)[1].lower() in ['.opf', '.jpg']: try: shutil.move( os.path.join( processpath, ourfile), os.path.join( dirname, ourfile)) except Exception as why: logger.debug( "Failed to move file %s to %s, %s" % (ourfile, dirname, str(why))) if os.path.isdir(os.path.join(processpath, fname)): pp_path = os.path.join(processpath, fname) logger.debug('Found folder (%s%%) %s for %s' % (match, pp_path, book['NZBtitle'])) matches.append([match, pp_path, book]) else: logger.debug('No match (%s%%) %s for %s' % (match, matchname, matchtitle)) else: logger.debug('Skipping %s' % fname) if matches: highest = max(matches, key=lambda x: x[0]) match = highest[0] pp_path = highest[1] book = highest[2] logger.debug(u'Best match (%s%%): %s for %s' % (match, pp_path, book['NZBtitle'])) data = myDB.match('SELECT * from books WHERE BookID="%s"' % book['BookID']) if data: logger.debug(u'Processing book %s' % book['BookID']) authorname = data['AuthorName'] bookname = data['BookName'] if 'windows' in platform.system().lower( ) and '/' in lazylibrarian.EBOOK_DEST_FOLDER: logger.warn('Please check your EBOOK_DEST_FOLDER setting') lazylibrarian.EBOOK_DEST_FOLDER = lazylibrarian.EBOOK_DEST_FOLDER.replace( '/', '\\') # Default destination path, should be allowed change per config file. dest_path = lazylibrarian.EBOOK_DEST_FOLDER.replace( '$Author', authorname).replace('$Title', bookname) global_name = lazylibrarian.EBOOK_DEST_FILE.replace( '$Author', authorname).replace('$Title', bookname) global_name = unaccented(global_name) # dest_path = authorname+'/'+bookname # global_name = bookname + ' - ' + authorname # Remove characters we don't want in the filename BEFORE adding to DESTINATION_DIR # as windows drive identifiers have colon, eg c: but no colons allowed elsewhere? dic = { '<': '', '>': '', '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': '' } dest_path = unaccented_str(replace_all(dest_path, dic)) dest_path = os.path.join(lazylibrarian.DESTINATION_DIR, dest_path).encode( lazylibrarian.SYS_ENCODING) else: data = myDB.match('SELECT * from magazines WHERE Title="%s"' % book['BookID']) if data: logger.debug(u'Processing magazine %s' % book['BookID']) # AuxInfo was added for magazine release date, normally housed in 'magazines' but if multiple # files are downloading, there will be an error in post-processing, trying to go to the # same directory. mostrecentissue = data[ 'IssueDate'] # keep for processing issues arriving out of order # Remove characters we don't want in the filename before (maybe) adding to DESTINATION_DIR # as windows drive identifiers have colon, eg c: but no colons allowed elsewhere? dic = { '<': '', '>': '', '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': '' } mag_name = unaccented_str(replace_all(book['BookID'], dic)) # book auxinfo is a cleaned date, eg 2015-01-01 dest_path = lazylibrarian.MAG_DEST_FOLDER.replace( '$IssueDate', book['AuxInfo']).replace('$Title', mag_name) # dest_path = '_Magazines/'+title+'/'+book['AuxInfo'] if lazylibrarian.MAG_RELATIVE: if dest_path[0] not in '._': dest_path = '_' + dest_path dest_path = os.path.join( lazylibrarian.DESTINATION_DIR, dest_path).encode(lazylibrarian.SYS_ENCODING) else: dest_path = dest_path.encode( lazylibrarian.SYS_ENCODING) authorname = None bookname = None global_name = lazylibrarian.MAG_DEST_FILE.replace( '$IssueDate', book['AuxInfo']).replace('$Title', mag_name) global_name = unaccented(global_name) # global_name = book['AuxInfo']+' - '+title else: logger.debug( "Snatched magazine %s is not in download directory" % (book['BookID'])) continue else: logger.debug("Snatched %s %s is not in download directory" % (book['NZBmode'], book['NZBtitle'])) continue processBook = processDestination(pp_path, dest_path, authorname, bookname, global_name, book['NZBmode']) if processBook: logger.debug("Processing %s, %s" % (global_name, book['NZBurl'])) # update nzbs, only update the snatched ones in case multiple matches for same book / magazine issue controlValueDict = {"NZBurl": book['NZBurl'], "Status": "Snatched"} newValueDict = { "Status": "Processed", "NZBDate": now() } # say when we processed it myDB.upsert("wanted", newValueDict, controlValueDict) if bookname is not None: # it's a book, if None it's a magazine if len(lazylibrarian.IMP_CALIBREDB): logger.debug( 'Calibre should have created the extras for us') else: processExtras(myDB, dest_path, global_name, data) else: # update mags controlValueDict = {"Title": book['BookID']} if mostrecentissue: if mostrecentissue.isdigit() and str( book['AuxInfo']).isdigit(): older = int(mostrecentissue) > int( book['AuxInfo']) # issuenumber else: older = mostrecentissue > book['AuxInfo'] # YYYY-MM-DD else: older = False if older: # check this in case processing issues arriving out of order newValueDict = { "LastAcquired": today(), "IssueStatus": "Open" } else: newValueDict = { "IssueDate": book['AuxInfo'], "LastAcquired": today(), "IssueStatus": "Open" } myDB.upsert("magazines", newValueDict, controlValueDict) # dest_path is where we put the magazine after processing, but we don't have the full filename # so look for any "book" in that directory dest_file = book_file(dest_path, booktype='mag') controlValueDict = { "Title": book['BookID'], "IssueDate": book['AuxInfo'] } newValueDict = { "IssueAcquired": today(), "IssueFile": dest_file, "IssueID": create_id("%s %s" % (book['BookID'], book['AuxInfo'])) } myDB.upsert("issues", newValueDict, controlValueDict) # create a thumbnail cover for the new issue create_cover(dest_file) logger.info('Successfully processed: %s' % global_name) ppcount = ppcount + 1 notify_download("%s from %s at %s" % (global_name, book['NZBprov'], now())) else: logger.error('Postprocessing for %s has failed.' % global_name) logger.error('Warning - Residual files remain in %s.fail' % pp_path) controlValueDict = {"NZBurl": book['NZBurl'], "Status": "Snatched"} newValueDict = {"Status": "Failed", "NZBDate": now()} myDB.upsert("wanted", newValueDict, controlValueDict) # if it's a book, reset status so we try for a different version # if it's a magazine, user can select a different one from pastissues table if bookname is not None: myDB.action( 'UPDATE books SET status = "Wanted" WHERE BookID="%s"' % book['BookID']) # at this point, as it failed we should move it or it will get postprocessed # again (and fail again) try: os.rename(pp_path, pp_path + '.fail') except Exception as e: logger.debug("Unable to rename %s, %s" % (pp_path, str(e))) downloads = os.listdir( processpath) # check in case we processed/deleted some above for directory in downloads: if "LL.(" in directory and not directory.endswith('.fail'): bookID = str(directory).split("LL.(")[1].split(")")[0] logger.debug("Book with id: " + str(bookID) + " is in downloads") pp_path = os.path.join(processpath, directory) if os.path.isfile(pp_path): pp_path = os.path.join(processpath) if (os.path.isdir(pp_path)): logger.debug('Found LL folder %s.' % pp_path) if import_book(pp_path, bookID): ppcount = ppcount + 1 if ppcount == 0: logger.info('No snatched books/mags have been found') else: logger.info('%s book%s/mag%s processed.' % (ppcount, plural(ppcount), plural(ppcount))) if reset: scheduleJob(action='Restart', target='processDir')
def processResultList(resultlist, book, searchtype): myDB = database.DBConnection() dictrepl = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': ' ', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '', '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '', '!': '', '-': ' ', '\s\s': ' '} # ' the ': ' ', ' a ': ' ', ' and ': ' ', # ' to ': ' ', ' of ': ' ', ' for ': ' ', ' my ': ' ', ' in ': ' ', ' at ': ' ', ' with ': ' '} dic = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': ''} match_ratio = int(lazylibrarian.MATCH_RATIO) reject_list = getList(lazylibrarian.REJECT_WORDS) author = unaccented_str(replace_all(book['authorName'], dic)) title = unaccented_str(replace_all(book['bookName'], dic)) matches = [] for nzb in resultlist: nzb_Title = unaccented_str(replace_all(nzb['nzbtitle'], dictrepl)).strip() nzb_Title = re.sub(r"\s\s+", " ", nzb_Title) # remove extra whitespace nzbAuthor_match = fuzz.token_set_ratio(author, nzb_Title) nzbBook_match = fuzz.token_set_ratio(title, nzb_Title) logger.debug(u"NZB author/book Match: %s/%s for %s" % (nzbAuthor_match, nzbBook_match, nzb_Title)) nzburl = nzb['nzburl'] rejected = False already_failed = myDB.action('SELECT * from wanted WHERE NZBurl="%s" and Status="Failed"' % nzburl).fetchone() if already_failed: logger.debug("Rejecting %s, blacklisted at %s" % (nzb_Title, already_failed['NZBprov'])) rejected = True if not rejected: for word in reject_list: if word in nzb_Title.lower() and not word in author.lower() and not word in title.lower(): rejected = True logger.debug("Rejecting %s, contains %s" % (nzb_Title, word)) break nzbsize_temp = nzb['nzbsize'] # Need to cater for when this is NONE (Issue 35) if nzbsize_temp is None: nzbsize_temp = 1000 nzbsize = round(float(nzbsize_temp) / 1048576, 2) maxsize = check_int(lazylibrarian.REJECT_MAXSIZE, 0) if not rejected: if maxsize and nzbsize > maxsize: rejected = True logger.debug("Rejecting %s, too large" % nzb_Title) if not rejected: if nzbAuthor_match >= match_ratio and nzbBook_match >= match_ratio: bookid = book['bookid'] nzbTitle = (author + ' - ' + title + ' LL.(' + book['bookid'] + ')').strip() nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbdate = nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": now(), # when we asked for it "NZBsize": nzbsize, "NZBtitle": nzbTitle, "NZBmode": nzbmode, "Status": "Skipped" } score = (nzbBook_match + nzbAuthor_match)/2 # as a percentage # lose a point for each extra word in the title so we get the closest match words = len(getList(nzb_Title)) words -= len(getList(author)) words -= len(getList(title)) score -= abs(words) matches.append([score, nzb_Title, newValueDict, controlValueDict]) if matches: highest = max(matches, key=lambda x: x[0]) score = highest[0] nzb_Title = highest[1] newValueDict = highest[2] controlValueDict = highest[3] logger.info(u'Best match NZB (%s%%): %s using %s search' % (score, nzb_Title, searchtype)) snatchedbooks = myDB.action('SELECT * from books WHERE BookID="%s" and Status="Snatched"' % newValueDict["BookID"]).fetchone() if snatchedbooks: logger.debug('%s already marked snatched' % nzb_Title) return True # someone else found it else: logger.debug('%s adding to wanted' % nzb_Title) myDB.upsert("wanted", newValueDict, controlValueDict) if nzbmode == "torznab": snatch = TORDownloadMethod(newValueDict["BookID"], newValueDict["NZBprov"], newValueDict["NZBtitle"], controlValueDict["NZBurl"]) else: snatch = NZBDownloadMethod(newValueDict["BookID"], newValueDict["NZBprov"], newValueDict["NZBtitle"], controlValueDict["NZBurl"]) if snatch: notifiers.notify_snatch(newValueDict["NZBtitle"] + ' at ' + now()) scheduleJob(action='Start', target='processDir') return True + True # we found it else: logger.debug("No nzb's found for [%s] using searchtype %s" % (book["searchterm"], searchtype)) return False
def search_magazines(mags=None, reset=False): # produce a list of magazines to search for, tor, nzb, torznab, rss try: threadname = threading.currentThread().name if "Thread-" in threadname: if mags is None: threading.currentThread().name = "SEARCHALLMAG" else: threading.currentThread().name = "SEARCHMAG" myDB = database.DBConnection() searchlist = [] if mags is None: # backlog search searchmags = myDB.select('SELECT Title, Regex, LastAcquired, \ IssueDate from magazines WHERE Status="Active"' ) else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select( 'SELECT Title, Regex, LastAcquired, IssueDate from magazines \ WHERE Title=? AND Status="Active"', (magazine['bookid'], )) for terms in searchmags_temp: searchmags.append(terms) if len(searchmags) == 0: threading.currentThread().name = "WEBSERVER" return # should clear old search results as might not be available any more # ie torrent not available, changed providers, out of news server retention etc. # Only delete the "skipped" ones, not wanted/snatched/processed/ignored logger.debug("Removing old magazine search results") myDB.action('DELETE from pastissues WHERE Status="Skipped"') logger.info('Searching for %i magazine%s' % (len(searchmags), plural(len(searchmags)))) for searchmag in searchmags: bookid = searchmag['Title'] searchterm = searchmag['Regex'] if not searchterm: dic = { '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '' } # strip accents from the magazine title for easier name-matching searchterm = unaccented_str(searchmag['Title']) if not searchterm: # unless it's not a latin-1 encodable name searchterm = searchmag['Title'] searchterm = replace_all(searchterm, dic) searchterm = re.sub('[.\-/]', ' ', searchterm) searchterm = searchterm.encode(lazylibrarian.SYS_ENCODING) searchlist.append({"bookid": bookid, "searchterm": searchterm}) if not searchlist: logger.warn( 'There is nothing to search for. Mark some magazines as active.' ) for book in searchlist: resultlist = [] if lazylibrarian.USE_NZB(): resultlist, nproviders = IterateOverNewzNabSites(book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_NZB_MSG, 0) + 1200 < timenow: logger.warn( 'No nzb providers are available. Check config and blocklist' ) lazylibrarian.NO_NZB_MSG = timenow if lazylibrarian.USE_DIRECT(): dir_resultlist, nproviders = IterateOverDirectSites( book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_DIRECT_MSG, 0) + 1200 < timenow: logger.warn( 'No direct providers are available. Check config and blocklist' ) lazylibrarian.NO_DIRECT_MSG = timenow if dir_resultlist: for item in dir_resultlist: # reformat the results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if lazylibrarian.USE_TOR(): tor_resultlist, nproviders = IterateOverTorrentSites( book, 'mag') if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_TOR_MSG, 0) + 1200 < timenow: logger.warn( 'No tor providers are available. Check config and blocklist' ) lazylibrarian.NO_TOR_MSG = timenow if tor_resultlist: for item in tor_resultlist: # reformat the torrent results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned from torrents 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if lazylibrarian.USE_RSS(): rss_resultlist, nproviders = IterateOverRSSSites() if not nproviders: # don't nag. Show warning message no more than every 20 mins timenow = int(time.time()) if check_int(lazylibrarian.NO_RSS_MSG, 0) + 1200 < timenow: logger.warn( 'No rss providers are available. Check config and blocklist' ) lazylibrarian.NO_RSS_MSG = timenow if rss_resultlist: for item in rss_resultlist: # reformat the rss results so they look like nzbs resultlist.append({ 'bookid': book['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': item[ 'tor_date'], # may be fake date as none returned from rss torrents, only rss nzb 'nzbsize': item['tor_size'], 'nzbmode': item['tor_type'] }) if not resultlist: logger.debug("No results for magazine %s" % book['searchterm']) else: bad_name = 0 bad_date = 0 old_date = 0 rejects = 0 total_nzbs = 0 new_date = 0 maglist = [] issues = [] bookid = '' for nzb in resultlist: total_nzbs += 1 bookid = nzb['bookid'] # strip accents from the magazine title for easier name-matching nzbtitle = unaccented_str(nzb['nzbtitle']) if not nzbtitle: # unless it's not a latin-1 encodable name nzbtitle = nzb['nzbtitle'] nzbtitle = nzbtitle.replace('"', '').replace( "'", "") # suppress " in titles nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] nzbsize_temp = check_int( nzbsize_temp, 1000 ) # not all torrents returned by torznab have a size nzbsize = round(float(nzbsize_temp) / 1048576, 2) nzbdate = nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] results = myDB.match( 'SELECT * from magazines WHERE Title=?', (bookid, )) if not results: logger.debug( 'Magazine [%s] does not match search term [%s].' % (nzbtitle, bookid)) bad_name += 1 else: rejected = False maxsize = check_int( lazylibrarian.CONFIG['REJECT_MAGSIZE'], 0) if maxsize and nzbsize > maxsize: logger.debug("Rejecting %s, too large" % nzbtitle) rejected = True if not rejected: minsize = check_int( lazylibrarian.CONFIG['REJECT_MAGMIN'], 0) if minsize and nzbsize < minsize: logger.debug("Rejecting %s, too small" % nzbtitle) rejected = True if not rejected: dic = { '.': ' ', '-': ' ', '/': ' ', '+': ' ', '_': ' ', '(': '', ')': '' } nzbtitle_formatted = replace_all(nzbtitle, dic).strip() # Need to make sure that substrings of magazine titles don't get found # (e.g. Maxim USA will find Maximum PC USA) - token_set_ratio takes care of this # remove extra spaces if they're in a row if nzbtitle_formatted and nzbtitle_formatted[ 0] == '[' and nzbtitle_formatted[-1] == ']': nzbtitle_formatted = nzbtitle_formatted[1:-1] nzbtitle_exploded_temp = " ".join( nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_exploded_temp.split( ' ') if ' ' in bookid: bookid_exploded = bookid.split(' ') else: bookid_exploded = [bookid] # check nzb has magazine title and a date/issue nr # eg The MagPI July 2015 if len(nzbtitle_exploded) > len(bookid_exploded): # needs to be longer as it has to include a date # check (nearly) all the words in the mag title are in the nzbtitle - allow some fuzz mag_title_match = fuzz.token_set_ratio( unaccented(bookid), unaccented(nzbtitle_formatted)) if mag_title_match < check_int( lazylibrarian.CONFIG['MATCH_RATIO'], 90): logger.debug( u"Magazine token set Match failed: " + str(mag_title_match) + "% for " + nzbtitle_formatted) rejected = True else: logger.debug(u"Magazine matched: " + str(mag_title_match) + "% " + bookid + " for " + nzbtitle_formatted) else: logger.debug("Magazine name too short (%s)" % len(nzbtitle_exploded)) rejected = True if not rejected: blocked = myDB.match( 'SELECT * from wanted WHERE NZBurl=? and Status="Failed"', (nzburl, )) if blocked: logger.debug( "Rejecting %s, blacklisted at %s" % (nzbtitle_formatted, blocked['NZBprov'])) rejected = True if not rejected: reject_list = getList( str(results['Reject']).lower()) reject_list += getList( lazylibrarian.CONFIG['REJECT_MAGS']) lower_title = unaccented( nzbtitle_formatted).lower() lower_bookid = unaccented(bookid).lower() if reject_list: if lazylibrarian.LOGLEVEL > 2: logger.debug('Reject: %s' % str(reject_list)) logger.debug('Title: %s' % lower_title) logger.debug('Bookid: %s' % lower_bookid) for word in reject_list: if word in lower_title and word not in lower_bookid: rejected = True logger.debug("Rejecting %s, contains %s" % (nzbtitle_formatted, word)) break regex_pass = 0 if not rejected: # Magazine names have many different styles of date # DD MonthName YYYY OR MonthName YYYY or Issue nn, MonthName YYYY # MonthName DD YYYY or MonthName DD, YYYY # YYYY MM or YYYY MM DD # Issue/No/Nr/Vol nn, YYYY or Issue/No/Nr/Vol nn # nn YYYY issue number without "Nr" before it # issue and year as a single 6 digit string eg 222015 newdatish = "none" # DD MonthName YYYY OR MonthName YYYY or Issue nn, MonthName YYYY pos = 0 while pos < len(nzbtitle_exploded): year = check_year(nzbtitle_exploded[pos]) if year and pos: month = month2num(nzbtitle_exploded[pos - 1]) if month: if pos - 1: day = check_int( nzbtitle_exploded[pos - 2], 1) if day > 31: # probably issue number nn day = 1 else: day = 1 newdatish = "%04d-%02d-%02d" % ( year, month, day) try: _ = datetime.date(year, month, day) regex_pass = 1 break except ValueError: regex_pass = 0 pos += 1 # MonthName DD YYYY or MonthName DD, YYYY if not regex_pass: pos = 0 while pos < len(nzbtitle_exploded): year = check_year(nzbtitle_exploded[pos]) if year and (pos - 1): month = month2num( nzbtitle_exploded[pos - 2]) if month: day = check_int( nzbtitle_exploded[ pos - 1].rstrip(','), 1) try: _ = datetime.date( year, month, day) newdatish = "%04d-%02d-%02d" % ( year, month, day) regex_pass = 2 break except ValueError: regex_pass = 0 pos += 1 # YYYY MM or YYYY MM DD if not regex_pass: pos = 0 while pos < len(nzbtitle_exploded): year = check_year(nzbtitle_exploded[pos]) if year and pos + 1 < len( nzbtitle_exploded): month = check_int( nzbtitle_exploded[pos + 1], 0) if month: if pos + 2 < len( nzbtitle_exploded): day = check_int( nzbtitle_exploded[pos + 2], 1) else: day = 1 try: _ = datetime.date( year, month, day) newdatish = "%04d-%02d-%02d" % ( year, month, day) regex_pass = 3 break except ValueError: regex_pass = 0 pos += 1 # Issue/No/Nr/Vol nn, YYYY or Issue/No/Nr/Vol nn if not regex_pass: pos = 0 while pos < len(nzbtitle_exploded): if nzbtitle_exploded[pos].lower() in [ "issue", "no", "nr", "vol" ]: if pos + 1 < len(nzbtitle_exploded): issue = check_int( nzbtitle_exploded[pos + 1], 0) if issue: newdatish = str( issue) # 4 == 04 == 004 if pos + 2 < len( nzbtitle_exploded): year = check_year( nzbtitle_exploded[pos + 2]) if year and year < int( datetime.date. today().year): newdatish = '0' # it's old regex_pass = 4 # Issue/No/Nr/Vol nn, YYYY else: regex_pass = 5 # Issue/No/Nr/Vol nn break pos += 1 # nn YYYY issue number without "Nr" before it if not regex_pass: pos = 1 while pos < len(nzbtitle_exploded): year = check_year(nzbtitle_exploded[pos]) if year: issue = check_int( nzbtitle_exploded[pos - 1], 0) if issue: newdatish = str( issue) # 4 == 04 == 004 regex_pass = 6 if year < int(datetime.date.today( ).year): newdatish = '0' # it's old break pos += 1 # issue and year as a single 6 digit string eg 222015 if not regex_pass: pos = 0 while pos < len(nzbtitle_exploded): issue = nzbtitle_exploded[pos] if issue.isdigit() and len(issue) == 6: year = int(issue[2:]) issue = int(issue[:2]) newdatish = str( issue) # 4 == 04 == 004 regex_pass = 7 if year < int( datetime.date.today().year): newdatish = '0' # it's old break pos += 1 if not regex_pass: logger.debug( 'Magazine %s not in a recognised date format.' % nzbtitle_formatted) bad_date += 1 # allow issues with good name but bad date to be included # so user can manually select them, incl those with issue numbers newdatish = "1970-01-01" # provide a fake date for bad-date issues regex_pass = 99 if rejected: rejects += 1 else: if lazylibrarian.LOGLEVEL > 2: logger.debug("regex %s [%s] %s" % (regex_pass, nzbtitle_formatted, newdatish)) # wanted issues go into wanted table marked "Wanted" # the rest into pastissues table marked "Skipped" insert_table = "pastissues" insert_status = "Skipped" control_date = results['IssueDate'] if control_date is None: # we haven't got any copies of this magazine yet # get a rough time just over a month ago to compare to, in format yyyy-mm-dd # could perhaps calc differently for weekly, biweekly etc # or for magazines with only an issue number, use zero if str(newdatish).isdigit(): logger.debug( 'Magazine comparing issue numbers (%s)' % newdatish) control_date = 0 elif re.match('\d+-\d\d-\d\d', str(newdatish)): start_time = time.time() start_time -= int( lazylibrarian.CONFIG['MAG_AGE'] ) * 24 * 60 * 60 # number of seconds in days if start_time < 0: # limit of unixtime (1st Jan 1970) start_time = 0 control_date = time.strftime( "%Y-%m-%d", time.localtime(start_time)) logger.debug( 'Magazine date comparing to %s' % control_date) else: logger.debug( 'Magazine unable to find comparison type [%s]' % newdatish) control_date = 0 if str(control_date).isdigit() and str( newdatish).isdigit(): # for issue numbers, check if later than last one we have comp_date = int(newdatish) - int(control_date) newdatish = "%s" % newdatish newdatish = newdatish.zfill( 4) # pad so we sort correctly elif re.match('\d+-\d\d-\d\d', str(control_date)) and \ re.match('\d+-\d\d-\d\d', str(newdatish)): # only grab a copy if it's newer than the most recent we have, # or newer than a month ago if we have none comp_date = datecompare( newdatish, control_date) else: # invalid comparison of date and issue number if re.match('\d+-\d\d-\d\d', str(control_date)): logger.debug( 'Magazine %s failed: Expecting a date' % nzbtitle_formatted) else: logger.debug( 'Magazine %s failed: Expecting issue number' % nzbtitle_formatted) bad_date += 1 newdatish = "1970-01-01" # this is our fake date for ones we can't decipher comp_date = 0 if comp_date > 0: # keep track of what we're going to download so we don't download dupes new_date += 1 issue = bookid + ',' + newdatish if issue not in issues: maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl, 'nzbmode': nzbmode }) logger.debug( 'This issue of %s is new, downloading' % nzbtitle_formatted) issues.append(issue) logger.debug('Magazine request number %s' % len(issues)) if lazylibrarian.LOGLEVEL > 2: logger.debug(str(issues)) insert_table = "wanted" insert_status = "Wanted" nzbdate = now() # when we asked for it else: logger.debug( 'This issue of %s is already flagged for download' % issue) else: if newdatish != "1970-01-01": # this is our fake date for ones we can't decipher logger.debug( 'This issue of %s is old; skipping.' % nzbtitle_formatted) old_date += 1 # store only the _new_ matching results # Don't add a new entry if this issue has been found on an earlier search # and status has been user-set ( we only delete the "Skipped" ones ) # In "wanted" table it might be already snatched/downloading/processing mag_entry = myDB.match( 'SELECT * from %s WHERE NZBtitle=? and NZBprov=?' % insert_table, (nzbtitle, nzbprov)) if mag_entry: if lazylibrarian.LOGLEVEL > 2: logger.debug( '%s is already in %s marked %s' % (nzbtitle, insert_table, insert_status)) else: controlValueDict = { "NZBtitle": nzbtitle, "NZBprov": nzbprov } newValueDict = { "NZBurl": nzburl, "BookID": bookid, "NZBdate": nzbdate, "AuxInfo": newdatish, "Status": insert_status, "NZBsize": nzbsize, "NZBmode": nzbmode } myDB.upsert(insert_table, newValueDict, controlValueDict) if lazylibrarian.LOGLEVEL > 2: logger.debug('Added %s to %s marked %s' % (nzbtitle, insert_table, insert_status)) msg = 'Found %i result%s for %s. %i new,' % ( total_nzbs, plural(total_nzbs), bookid, new_date) msg += ' %i old, %i fail date, %i fail name,' % ( old_date, bad_date, bad_name) msg += ' %i rejected: %i to download' % (rejects, len(maglist)) logger.info(msg) for magazine in maglist: if magazine['nzbmode'] in ["torznab", "torrent", "magnet"]: snatch = TORDownloadMethod(magazine['bookid'], magazine['nzbtitle'], magazine['nzburl'], 'magazine') else: snatch = NZBDownloadMethod(magazine['bookid'], magazine['nzbtitle'], magazine['nzburl'], 'magazine') if snatch: logger.info( 'Downloading %s from %s' % (magazine['nzbtitle'], magazine["nzbprov"])) notify_snatch("Magazine %s from %s at %s" % (unaccented(magazine['nzbtitle']), magazine["nzbprov"], now())) custom_notify_snatch(magazine['bookid']) scheduleJob(action='Start', target='processDir') if reset: scheduleJob(action='Restart', target='search_magazines') logger.info("Search for magazines complete") except Exception: logger.error('Unhandled exception in search_magazines: %s' % traceback.format_exc()) finally: threading.currentThread().name = "WEBSERVER"
def find_results(self, authorname=None, queue=None): threading.currentThread().name = "GB-SEARCH" resultlist = [] # See if we should check ISBN field, otherwise ignore it try: isbn_check = int(authorname[:-1]) if (len(str(isbn_check)) == 9) or (len(str(isbn_check)) == 12): api_strings = ["isbn:"] else: api_strings = ["inauthor:", "intitle:"] except: api_strings = ["inauthor:", "intitle:"] api_hits = 0 logger.debug("Now searching Google Books API with keyword: " + self.name) for api_value in api_strings: startindex = 0 if api_value == "isbn:": set_url = self.url + urllib.quote(api_value + self.name) else: set_url = self.url + urllib.quote(api_value + '"' + self.name + '"') try: startindex = 0 resultcount = 0 # removedResults = 0 ignored = 0 number_results = 1 total_count = 0 no_author_count = 0 while startindex < number_results: self.params["startIndex"] = startindex URL = set_url + "&" + urllib.urlencode(self.params) try: jsonresults, in_cache = self.get_request(URL) if not in_cache: api_hits = api_hits + 1 number_results = jsonresults["totalItems"] logger.debug("Searching url: " + URL) if number_results == 0: logger.warn("Found no results for %s with value: %s" % (api_value, self.name)) break else: pass except HTTPError as err: logger.warn("Google Books API Error [%s]: Check your API key or wait a while" % err.msg) break startindex = startindex + 40 for item in jsonresults["items"]: total_count = total_count + 1 # skip if no author, no author is no book. try: Author = item["volumeInfo"]["authors"][0] except KeyError: logger.debug("Skipped a result without authorfield.") no_author_count = no_author_count + 1 continue valid_langs = [valid_lang.strip() for valid_lang in lazylibrarian.IMP_PREFLANG.split(",")] if "All" not in valid_langs: # don't care about languages, accept all try: # skip if language is not in valid list - booklang = item["volumeInfo"]["language"] if booklang not in valid_langs: logger.debug("Skipped a book with language %s" % booklang) ignored = ignored + 1 continue except KeyError: ignored = ignored + 1 logger.debug("Skipped a result where no language is found") continue try: bookpub = item["volumeInfo"]["publisher"] except KeyError: bookpub = None try: booksub = item["volumeInfo"]["subtitle"] except KeyError: booksub = None try: bookdate = item["volumeInfo"]["publishedDate"] except KeyError: bookdate = "0000-00-00" bookdate = bookdate[:4] try: bookimg = item["volumeInfo"]["imageLinks"]["thumbnail"] except KeyError: bookimg = "images/nocover.png" try: bookrate = item["volumeInfo"]["averageRating"] except KeyError: bookrate = 0 try: bookpages = item["volumeInfo"]["pageCount"] except KeyError: bookpages = "0" try: bookgenre = item["volumeInfo"]["categories"][0] except KeyError: bookgenre = None try: bookdesc = item["volumeInfo"]["description"] except KeyError: bookdesc = "Not available" try: num_reviews = item["volumeInfo"]["ratingsCount"] except KeyError: num_reviews = 0 try: if item["volumeInfo"]["industryIdentifiers"][0]["type"] == "ISBN_10": bookisbn = item["volumeInfo"]["industryIdentifiers"][0]["identifier"] else: bookisbn = 0 except KeyError: bookisbn = 0 author_fuzz = fuzz.token_set_ratio(Author, authorname) book_fuzz = fuzz.token_set_ratio(item["volumeInfo"]["title"], authorname) try: isbn_check = int(authorname[:-1]) if (len(str(isbn_check)) == 9) or (len(str(isbn_check)) == 12): isbn_fuzz = int(100) else: isbn_fuzz = int(0) except: isbn_fuzz = int(0) highest_fuzz = max(author_fuzz, book_fuzz, isbn_fuzz) # Darkie67: # replacing German Umlauts and filtering out ":" # # PAB I think this is to avoid removing the book for bad characters? # Do we need to remove books with bad characters in their name anyway? # Use unidecode as a more thorough way to strip and see what still gets marked "bad" # # booknamealt = item['volumeInfo']['title'] # booknametmp1=booknamealt.replace(u'\xf6',u'oe') # booknametmp2=booknametmp1.replace(u'\xe4',u'ae') # booknametmp3=booknametmp2.replace(u'\xdf',u'ss') # booknametmp4=booknametmp3.replace(u'\xc4',u'Ae') # booknametmp5=booknametmp4.replace(u'\xdc',u'Ue') # booknametmp6=booknametmp5.replace(u'\xd6',u'Oe') # booknametmp7=booknametmp6.replace(':','') # bookname=booknametmp7.replace(u'\xfc',u'ue') bookname = item["volumeInfo"]["title"] bookname = bookname.replace(":", "").replace('"', "").replace("'", "") bookname = unidecode(u"%s" % bookname) bookname = bookname.strip() # strip whitespace # Darkie67 end resultlist.append( { "authorname": Author, "bookid": item["id"], "bookname": bookname, "booksub": booksub, "bookisbn": bookisbn, "bookpub": bookpub, "bookdate": bookdate, "booklang": booklang, "booklink": item["volumeInfo"]["canonicalVolumeLink"], "bookrate": float(bookrate), "bookimg": bookimg, "bookpages": bookpages, "bookgenre": bookgenre, "bookdesc": bookdesc, "author_fuzz": author_fuzz, "book_fuzz": book_fuzz, "isbn_fuzz": isbn_fuzz, "highest_fuzz": highest_fuzz, "num_reviews": num_reviews, } ) resultcount = resultcount + 1 except KeyError: break logger.debug("Found %s total results" % total_count) logger.debug("Removed %s bad language results" % ignored) logger.debug("Removed %s books with no author" % no_author_count) logger.debug("Showing %s results for (%s) with keyword: %s" % (resultcount, api_value, authorname)) logger.debug("The Google Books API was hit %s times for keyword %s" % (str(api_hits), self.name)) queue.put(resultlist)
def searchItem(item=None, bookid=None, cat=None): """ Call all active search providers to search for item return a list of results, each entry in list containing percentage_match, title, provider, size, url item = searchterm to use for general search bookid = link to data for book/audio searches cat = category to search [general, book, audio] """ results = [] if not item: return results book = {} searchterm = unaccented_str(item) book['searchterm'] = searchterm if bookid: book['bookid'] = bookid else: book['bookid'] = searchterm if cat in ['book', 'audio']: myDB = database.DBConnection() cmd = 'SELECT authorName,bookName,bookSub from books,authors WHERE books.AuthorID=authors.AuthorID' cmd += ' and bookID=?' match = myDB.match(cmd, (bookid,)) if match: book['authorName'] = match['authorName'] book['bookName'] = match['bookName'] book['bookSub'] = match['bookSub'] else: logger.debug('Forcing general search') cat = 'general' nprov = lazylibrarian.USE_NZB() + lazylibrarian.USE_TOR() + lazylibrarian.USE_RSS() + lazylibrarian.USE_DIRECT() logger.debug('Searching %s provider%s (%s) for %s' % (nprov, plural(nprov), cat, searchterm)) if lazylibrarian.USE_NZB(): resultlist, nprov = IterateOverNewzNabSites(book, cat) if nprov: results += resultlist if lazylibrarian.USE_TOR(): resultlist, nprov = IterateOverTorrentSites(book, cat) if nprov: results += resultlist if lazylibrarian.USE_DIRECT(): resultlist, nprov = IterateOverDirectSites(book, cat) if nprov: results += resultlist if lazylibrarian.USE_RSS(): resultlist, nprov, dltypes = IterateOverRSSSites() if nprov and dltypes != 'M': results += resultlist # reprocess to get consistent results searchresults = [] for item in results: provider = '' title = '' url = '' size = '' date = '' mode = '' if 'dispname' in item: provider = item['dispname'] elif 'nzbprov' in item: provider = item['nzbprov'] elif 'tor_prov' in item: provider = item['tor_prov'] elif 'rss_prov' in item: provider = item['rss_prov'] if 'nzbtitle' in item: title = item['nzbtitle'] if 'nzburl' in item: url = item['nzburl'] if 'nzbsize' in item: size = item['nzbsize'] if 'nzbdate' in item: date = item['nzbdate'] if 'nzbmode' in item: mode = item['nzbmode'] if 'tor_title' in item: title = item['tor_title'] if 'tor_url' in item: url = item['tor_url'] if 'tor_size' in item: size = item['tor_size'] if 'tor_date' in item: date = item['tor_date'] if 'tor_type' in item: mode = item['tor_type'] if title and provider and mode and url: # Not all results have a date or a size if not date: date = 'Fri, 01 Jan 1970 00:00:00 +0100' if not size: size = '1000' url = url.encode('utf-8') if mode == 'torznab': # noinspection PyTypeChecker if url.startswith('magnet'): mode = 'magnet' # calculate match percentage - torrents might have words_with_underscore_separator score = fuzz.token_set_ratio(searchterm, title.replace('_', ' ')) # lose a point for each extra word in the title so we get the closest match words = len(getList(searchterm)) words -= len(getList(title)) score -= abs(words) if score >= 40: # ignore wildly wrong results? result = {'score': score, 'title': title, 'provider': provider, 'size': size, 'date': date, 'url': quote_plus(url), 'mode': mode} searchresults.append(result) logger.debug('Found %s %s results for %s' % (len(searchresults), cat, searchterm)) return searchresults
def processResultList(resultlist, book, searchtype): myDB = database.DBConnection() dictrepl = { "...": "", ".": " ", " & ": " ", " = ": " ", "?": "", "$": "s", " + ": " ", '"': "", ",": " ", "*": "", "(": "", ")": "", "[": "", "]": "", "#": "", "0": "", "1": "", "2": "", "3": "", "4": "", "5": "", "6": "", "7": "", "8": "", "9": "", "'": "", ":": "", "!": "", "-": " ", "\s\s": " ", } # ' the ': ' ', ' a ': ' ', ' and ': ' ', ' to ': ' ', ' of ': ' ', # ' for ': ' ', ' my ': ' ', ' in ': ' ', ' at ': ' ', ' with ': ' '} dic = { "...": "", ".": " ", " & ": " ", " = ": " ", "?": "", "$": "s", " + ": " ", '"': "", ",": "", "*": "", ":": "", ";": "", } match_ratio = int(lazylibrarian.MATCH_RATIO) reject_list = formatter.getList(lazylibrarian.REJECT_WORDS) for tor in resultlist: tor_Title = formatter.latinToAscii(formatter.replace_all(str(tor["tor_title"]), dictrepl)).strip() tor_Title = re.sub(r"\s\s+", " ", tor_Title) # remove extra whitespace author = formatter.latinToAscii(formatter.replace_all(book["authorName"], dic)) title = formatter.latinToAscii(formatter.replace_all(book["bookName"], dic)) torAuthor_match = fuzz.token_set_ratio(author, tor_Title) torBook_match = fuzz.token_set_ratio(title, tor_Title) logger.debug(u"TOR author/book Match: %s/%s for %s" % (torAuthor_match, torBook_match, tor_Title)) # tor_Title_match = fuzz.token_set_ratio(book['searchterm'], tor_Title) # logger.debug("Torrent Title Match %: " + str(tor_Title_match) + " for " + tor_Title) # if (tor_Title_match >= match_ratio): rejected = False for word in reject_list: if word in tor_Title.lower() and not word in author.lower() and not word in title_lower(): rejected = True logger.debug("Rejecting %s, contains %s" % (tor_Title, word)) break if torAuthor_match >= match_ratio and torBook_match >= match_ratio and not rejected: logger.debug(u"Found Torrent: %s using %s search" % (tor["tor_title"], searchtype)) bookid = book["bookid"] tor_Title = (author + " - " + title + " LL.(" + book["bookid"] + ")").strip() tor_url = tor["tor_url"] tor_prov = tor["tor_prov"] tor_size_temp = tor["tor_size"] # Need to cater for when this is NONE (Issue 35) if tor_size_temp is None: tor_size_temp = 1000 tor_size = str(round(float(tor_size_temp) / 1048576, 2)) + " MB" controlValueDict = {"NZBurl": tor_url} newValueDict = { "NZBprov": tor_prov, "BookID": bookid, "NZBdate": formatter.now(), # when we asked for it "NZBsize": tor_size, "NZBtitle": tor_Title, "NZBmode": "torrent", "Status": "Skipped", } myDB.upsert("wanted", newValueDict, controlValueDict) snatchedbooks = myDB.action( 'SELECT * from books WHERE BookID="%s" and Status="Snatched"' % bookid ).fetchone() if not snatchedbooks: snatch = TORDownloadMethod(bookid, tor_prov, tor_Title, tor_url) if snatch: notifiers.notify_snatch(formatter.latinToAscii(tor_Title) + " at " + formatter.now()) common.schedule_job(action="Start", target="processDir") return True logger.debug( "No torrent's found for " + (book["authorName"] + " " + book["bookName"]).strip() + " using searchtype " + searchtype ) return False
def processResultList(resultlist, book, searchtype): myDB = database.DBConnection() dictrepl = { '...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': ' ', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '', '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '', '!': '', '-': ' ', '\s\s': ' ' } # ' the ': ' ', ' a ': ' ', ' and ': ' ', # ' to ': ' ', ' of ': ' ', ' for ': ' ', ' my ': ' ', ' in ': ' ', ' at ': ' ', ' with ': ' '} dic = { '...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': '' } match_ratio = int(lazylibrarian.MATCH_RATIO) reject_list = getList(lazylibrarian.REJECT_WORDS) author = unaccented_str(replace_all(book['authorName'], dic)) title = unaccented_str(replace_all(book['bookName'], dic)) matches = [] for nzb in resultlist: nzb_Title = unaccented_str(replace_all(nzb['nzbtitle'], dictrepl)).strip() nzb_Title = re.sub(r"\s\s+", " ", nzb_Title) # remove extra whitespace nzbAuthor_match = fuzz.token_set_ratio(author, nzb_Title) nzbBook_match = fuzz.token_set_ratio(title, nzb_Title) logger.debug(u"NZB author/book Match: %s/%s for %s" % (nzbAuthor_match, nzbBook_match, nzb_Title)) nzburl = nzb['nzburl'] rejected = False already_failed = myDB.action( 'SELECT * from wanted WHERE NZBurl="%s" and Status="Failed"' % nzburl).fetchone() if already_failed: logger.debug("Rejecting %s, blacklisted at %s" % (nzb_Title, already_failed['NZBprov'])) rejected = True if not rejected: for word in reject_list: if word in nzb_Title.lower( ) and not word in author.lower() and not word in title.lower(): rejected = True logger.debug("Rejecting %s, contains %s" % (nzb_Title, word)) break nzbsize_temp = nzb[ 'nzbsize'] # Need to cater for when this is NONE (Issue 35) if nzbsize_temp is None: nzbsize_temp = 1000 nzbsize = round(float(nzbsize_temp) / 1048576, 2) maxsize = check_int(lazylibrarian.REJECT_MAXSIZE, 0) if not rejected: if maxsize and nzbsize > maxsize: rejected = True logger.debug("Rejecting %s, too large" % nzb_Title) if not rejected: if nzbAuthor_match >= match_ratio and nzbBook_match >= match_ratio: bookid = book['bookid'] nzbTitle = (author + ' - ' + title + ' LL.(' + book['bookid'] + ')').strip() nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbdate = nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": now(), # when we asked for it "NZBsize": nzbsize, "NZBtitle": nzbTitle, "NZBmode": nzbmode, "Status": "Skipped" } score = (nzbBook_match + nzbAuthor_match) / 2 # as a percentage # lose a point for each extra word in the title so we get the closest match words = len(getList(nzb_Title)) words -= len(getList(author)) words -= len(getList(title)) score -= abs(words) matches.append( [score, nzb_Title, newValueDict, controlValueDict]) if matches: highest = max(matches, key=lambda x: x[0]) score = highest[0] nzb_Title = highest[1] newValueDict = highest[2] controlValueDict = highest[3] logger.info(u'Best match NZB (%s%%): %s using %s search' % (score, nzb_Title, searchtype)) snatchedbooks = myDB.action( 'SELECT * from books WHERE BookID="%s" and Status="Snatched"' % newValueDict["BookID"]).fetchone() if snatchedbooks: logger.debug('%s already marked snatched' % nzb_Title) return True # someone else found it else: logger.debug('%s adding to wanted' % nzb_Title) myDB.upsert("wanted", newValueDict, controlValueDict) if nzbmode == "torznab": snatch = TORDownloadMethod(newValueDict["BookID"], newValueDict["NZBprov"], newValueDict["NZBtitle"], controlValueDict["NZBurl"]) else: snatch = NZBDownloadMethod(newValueDict["BookID"], newValueDict["NZBprov"], newValueDict["NZBtitle"], controlValueDict["NZBurl"]) if snatch: notifiers.notify_snatch(newValueDict["NZBtitle"] + ' at ' + now()) scheduleJob(action='Start', target='processDir') return True + True # we found it else: logger.debug("No nzb's found for [%s] using searchtype %s" % (book["searchterm"], searchtype)) return False
def LibraryScan(startdir=None): """ Scan a directory tree adding new books into database Return how many books you added """ try: destdir = lazylibrarian.DIRECTORY('Destination') if not startdir: if not destdir: logger.warn('Cannot find destination directory: %s. Not scanning' % destdir) return 0 startdir = destdir if not os.path.isdir(startdir): logger.warn('Cannot find directory: %s. Not scanning' % startdir) return 0 if not internet(): logger.warn('Libraryscan: No internet connection') return 0 myDB = database.DBConnection() # keep statistics of full library scans if startdir == destdir: myDB.action('DELETE from stats') try: # remove any extra whitespace in authornames authors = myDB.select('SELECT AuthorID,AuthorName FROM authors WHERE AuthorName like "% %"') if authors: logger.info('Removing extra spaces from %s authorname%s' % (len(authors), plural(len(authors)))) for author in authors: authorid = author["AuthorID"] authorname = ' '.join(author['AuthorName'].split()) # Have we got author name both with-and-without extra spaces? If so, merge them duplicate = myDB.match( 'Select AuthorID,AuthorName FROM authors WHERE AuthorName="%s"' % authorname) if duplicate: myDB.action('DELETE from authors where authorname="%s"' % author['AuthorName']) if author['AuthorID'] != duplicate['AuthorID']: myDB.action('UPDATE books set AuthorID="%s" WHERE AuthorID="%s"' % (duplicate['AuthorID'], author['AuthorID'])) else: myDB.action( 'UPDATE authors set AuthorName="%s" WHERE AuthorID="%s"' % (authorname, authorid)) except Exception as e: logger.info('Error: ' + str(e)) logger.info('Scanning ebook directory: %s' % startdir) new_book_count = 0 modified_count = 0 rescan_count = 0 rescan_hits = 0 file_count = 0 author = "" if lazylibrarian.CONFIG['FULL_SCAN']: cmd = 'select AuthorName, BookName, BookFile, BookID from books,authors' cmd += ' where books.AuthorID = authors.AuthorID and books.Status="Open"' if not startdir == destdir: cmd += ' and BookFile like "' + startdir + '%"' books = myDB.select(cmd) status = lazylibrarian.CONFIG['NOTFOUND_STATUS'] logger.info('Missing books will be marked as %s' % status) for book in books: bookID = book['BookID'] bookfile = book['BookFile'] if not (bookfile and os.path.isfile(bookfile)): myDB.action('update books set Status="%s" where BookID="%s"' % (status, bookID)) myDB.action('update books set BookFile="" where BookID="%s"' % bookID) logger.warn('Book %s - %s updated as not found on disk' % (book['AuthorName'], book['BookName'])) # to save repeat-scans of the same directory if it contains multiple formats of the same book, # keep track of which directories we've already looked at processed_subdirectories = [] warned = False # have we warned about no new authors setting matchString = '' for char in lazylibrarian.CONFIG['EBOOK_DEST_FILE']: matchString = matchString + '\\' + char # massage the EBOOK_DEST_FILE config parameter into something we can use # with regular expression matching booktypes = '' count = -1 booktype_list = getList(lazylibrarian.CONFIG['EBOOK_TYPE']) for book_type in booktype_list: count += 1 if count == 0: booktypes = book_type else: booktypes = booktypes + '|' + book_type matchString = matchString.replace("\\$\\A\\u\\t\\h\\o\\r", "(?P<author>.*?)").replace( "\\$\\T\\i\\t\\l\\e", "(?P<book>.*?)") + '\.[' + booktypes + ']' pattern = re.compile(matchString, re.VERBOSE) for r, d, f in os.walk(startdir): for directory in d[:]: # prevent magazine being scanned if directory.startswith("_") or directory.startswith("."): d.remove(directory) for files in f: file_count += 1 if isinstance(r, str): r = r.decode(lazylibrarian.SYS_ENCODING) subdirectory = r.replace(startdir, '') # Added new code to skip if we've done this directory before. # Made this conditional with a switch in config.ini # in case user keeps multiple different books in the same subdirectory if lazylibrarian.CONFIG['IMP_SINGLEBOOK'] and (subdirectory in processed_subdirectories): logger.debug("[%s] already scanned" % subdirectory) else: # If this is a book, try to get author/title/isbn/language # if epub or mobi, read metadata from the book # If metadata.opf exists, use that allowing it to override # embedded metadata. User may have edited metadata.opf # to merge author aliases together # If all else fails, try pattern match for author/title # and look up isbn/lang from LT or GR later match = 0 if is_valid_booktype(files): logger.debug("[%s] Now scanning subdirectory %s" % (startdir, subdirectory)) language = "Unknown" isbn = "" book = "" author = "" gr_id = "" gb_id = "" extn = os.path.splitext(files)[1] # if it's an epub or a mobi we can try to read metadata from it if (extn == ".epub") or (extn == ".mobi"): book_filename = os.path.join(r, files).encode(lazylibrarian.SYS_ENCODING) try: res = get_book_info(book_filename) except Exception as e: logger.debug('get_book_info failed for %s, %s' % (book_filename, str(e))) res = {} # title and creator are the minimum we need if 'title' in res and 'creator' in res: book = res['title'] author = res['creator'] if book and len(book) > 2 and author and len(author) > 2: match = 1 if 'language' in res: language = res['language'] if 'identifier' in res: isbn = res['identifier'] if 'type' in res: extn = res['type'] logger.debug("book meta [%s] [%s] [%s] [%s] [%s]" % (isbn, language, author, book, extn)) if not match: logger.debug("Book meta incomplete in %s" % book_filename) # calibre uses "metadata.opf", LL uses "bookname - authorname.opf" # just look for any .opf file in the current directory since we don't know # LL preferred authorname/bookname at this point. # Allow metadata in file to override book contents as may be users pref metafile = opf_file(r) try: res = get_book_info(metafile) except Exception as e: logger.debug('get_book_info failed for %s, %s' % (metafile, str(e))) res = {} # title and creator are the minimum we need if 'title' in res and 'creator' in res: book = res['title'] author = res['creator'] if book and len(book) > 2 and author and len(author) > 2: match = 1 if 'language' in res: language = res['language'] if 'identifier' in res: isbn = res['identifier'] if 'gr_id' in res: gr_id = res['gr_id'] logger.debug("file meta [%s] [%s] [%s] [%s] [%s]" % (isbn, language, author, book, gr_id)) if not match: logger.debug("File meta incomplete in %s" % metafile) if not match: # no author/book from metadata file, and not embedded either match = pattern.match(files) if match: author = match.group("author") book = match.group("book") if len(book) <= 2 or len(author) <= 2: match = 0 if not match: logger.debug("Pattern match failed [%s]" % files) if match: # flag that we found a book in this subdirectory processed_subdirectories.append(subdirectory) # If we have a valid looking isbn, and language != "Unknown", add it to cache if language != "Unknown" and is_valid_isbn(isbn): logger.debug("Found Language [%s] ISBN [%s]" % (language, isbn)) # we need to add it to language cache if not already # there, is_valid_isbn has checked length is 10 or 13 if len(isbn) == 10: isbnhead = isbn[0:3] else: isbnhead = isbn[3:6] match = myDB.match('SELECT lang FROM languages where isbn = "%s"' % isbnhead) if not match: myDB.action('insert into languages values ("%s", "%s")' % (isbnhead, language)) logger.debug("Cached Lang [%s] ISBN [%s]" % (language, isbnhead)) else: logger.debug("Already cached Lang [%s] ISBN [%s]" % (language, isbnhead)) author, authorid, new = addAuthorNameToDB(author) # get the author name as we know it... if author: # author exists, check if this book by this author is in our database # metadata might have quotes in book name # some books might be stored under a different author name # eg books by multiple authors, books where author is "writing as" # or books we moved to "merge" authors book = book.replace("'", "") # First try and find it under author and bookname # as we may have it under a different bookid or isbn to goodreads/googlebooks # which might have several bookid/isbn for the same book bookid = find_book_in_db(myDB, author, book) if not bookid: # Title or author name might not match or multiple authors # See if the gr_id, gb_id is already in our database if gr_id: bookid = gr_id elif gb_id: bookid = gb_id else: bookid = "" if bookid: match = myDB.match('SELECT BookID FROM books where BookID = "%s"' % bookid) if not match: msg = 'Unable to find book %s by %s in database, trying to add it using ' if bookid == gr_id: msg += "GoodReads ID " + gr_id if bookid == gb_id: msg += "GoogleBooks ID " + gb_id logger.debug(msg % (book, author)) if lazylibrarian.CONFIG['BOOK_API'] == "GoodReads" and gr_id: GR_ID = GoodReads(gr_id) GR_ID.find_book(gr_id, None) elif lazylibrarian.CONFIG['BOOK_API'] == "GoogleBooks" and gb_id: GB_ID = GoogleBooks(gb_id) GB_ID.find_book(gb_id, None) # see if it's there now... match = myDB.match('SELECT BookID from books where BookID="%s"' % bookid) if not match: logger.debug("Unable to add bookid %s to database" % bookid) bookid = "" if not bookid and isbn: # See if the isbn is in our database match = myDB.match('SELECT BookID FROM books where BookIsbn = "%s"' % isbn) if match: bookid = match['BookID'] if not bookid: # get author name from parent directory of this book directory newauthor = os.path.basename(os.path.dirname(r)) # calibre replaces trailing periods with _ eg Smith Jr. -> Smith Jr_ if newauthor.endswith('_'): newauthor = newauthor[:-1] + '.' if author.lower() != newauthor.lower(): logger.debug("Trying authorname [%s]" % newauthor) bookid = find_book_in_db(myDB, newauthor, book) if bookid: logger.warn("%s not found under [%s], found under [%s]" % (book, author, newauthor)) # at this point if we still have no bookid, it looks like we # have author and book title but no database entry for it if not bookid: if lazylibrarian.CONFIG['BOOK_API'] == "GoodReads": # Either goodreads doesn't have the book or it didn't match language prefs # Since we have the book anyway, try and reload it ignoring language prefs rescan_count += 1 base_url = 'http://www.goodreads.com/search.xml?q=' params = {"key": lazylibrarian.CONFIG['GR_API']} if author[1] in '. ': surname = author forename = '' while surname[1] in '. ': forename = forename + surname[0] + '.' surname = surname[2:].strip() if author != forename + ' ' + surname: logger.debug('Stripped authorname [%s] to [%s %s]' % (author, forename, surname)) author = forename + ' ' + surname author = ' '.join(author.split()) # ensure no extra whitespace searchname = author + ' ' + book searchname = cleanName(unaccented(searchname)) searchterm = urllib.quote_plus(searchname.encode(lazylibrarian.SYS_ENCODING)) set_url = base_url + searchterm + '&' + urllib.urlencode(params) try: rootxml, in_cache = get_xml_request(set_url) if not len(rootxml): logger.debug("Error requesting results from GoodReads") else: resultxml = rootxml.getiterator('work') for item in resultxml: booktitle = item.find('./best_book/title').text book_fuzz = fuzz.token_set_ratio(booktitle, book) if book_fuzz >= 98: logger.debug("Rescan found %s : %s" % (booktitle, language)) rescan_hits += 1 bookid = item.find('./best_book/id').text GR_ID = GoodReads(bookid) GR_ID.find_book(bookid, None) if language and language != "Unknown": # set language from book metadata logger.debug("Setting language from metadata %s : %s" % (booktitle, language)) myDB.action('UPDATE books SET BookLang="%s" WHERE BookID="%s"' % (language, bookid)) break if not bookid: logger.warn("GoodReads doesn't know about %s" % book) except Exception as e: logger.error("Error finding rescan results: %s" % str(e)) elif lazylibrarian.CONFIG['BOOK_API'] == "GoogleBooks": # if we get here using googlebooks it's because googlebooks # doesn't have the book. No point in looking for it again. logger.warn("GoogleBooks doesn't know about %s" % book) # see if it's there now... if bookid: cmd = 'SELECT books.Status, BookFile, AuthorName, BookName from books,authors ' cmd += 'where books.AuthorID = authors.AuthorID and BookID="%s"' % bookid check_status = myDB.match(cmd) if not check_status: logger.debug('Unable to find bookid %s in database' % bookid) else: if check_status['Status'] != 'Open': # we found a new book new_book_count += 1 myDB.action( 'UPDATE books set Status="Open" where BookID="%s"' % bookid) # store book location so we can check if it gets removed book_filename = os.path.join(r, files) if not check_status['BookFile']: # no previous location myDB.action('UPDATE books set BookFile="%s" where BookID="%s"' % (book_filename, bookid)) # location may have changed since last scan elif book_filename != check_status['BookFile']: modified_count += 1 logger.warn("Updating book location for %s %s from %s to %s" % (author, book, check_status['BookFile'], book_filename)) logger.debug("%s %s matched %s BookID %s, [%s][%s]" % (author, book, check_status['Status'], bookid, check_status['AuthorName'], check_status['BookName'])) myDB.action('UPDATE books set BookFile="%s" where BookID="%s"' % (book_filename, bookid)) # update cover file to cover.jpg in book folder (if exists) bookdir = os.path.dirname(book_filename) coverimg = os.path.join(bookdir, 'cover.jpg') if os.path.isfile(coverimg): cachedir = lazylibrarian.CACHEDIR cacheimg = os.path.join(cachedir, 'book', bookid + '.jpg') copyfile(coverimg, cacheimg) else: logger.warn( "Failed to match book [%s] by [%s] in database" % (book, author)) else: if not warned and not lazylibrarian.CONFIG['ADD_AUTHOR']: logger.warn("Add authors to database is disabled") warned = True logger.info("%s/%s new/modified book%s found and added to the database" % (new_book_count, modified_count, plural(new_book_count + modified_count))) logger.info("%s file%s processed" % (file_count, plural(file_count))) if startdir == destdir: # On full library scans, check for missing workpages setWorkPages() # and books with unknown language nolang = myDB.match( "select count('BookID') as counter from Books where status='Open' and BookLang='Unknown'") nolang = nolang['counter'] if nolang: logger.warn("Found %s book%s in your library with unknown language" % (nolang, plural(nolang))) # show stats if new books were added stats = myDB.match( "SELECT sum(GR_book_hits), sum(GR_lang_hits), sum(LT_lang_hits), sum(GB_lang_change), \ sum(cache_hits), sum(bad_lang), sum(bad_char), sum(uncached), sum(duplicates) FROM stats") st= {'GR_book_hits': stats['sum(GR_book_hits)'], 'GB_book_hits': stats['sum(GR_book_hits)'], 'GR_lang_hits': stats['sum(GR_lang_hits)'], 'LT_lang_hits': stats['sum(LT_lang_hits)'], 'GB_lang_change': stats['sum(GB_lang_change)'], 'cache_hits': stats['sum(cache_hits)'], 'bad_lang': stats['sum(bad_lang)'], 'bad_char': stats['sum(bad_char)'], 'uncached': stats['sum(uncached)'], 'duplicates': stats['sum(duplicates)']} for item in st.keys(): if st[item] is None: st[item] = 0 if lazylibrarian.CONFIG['BOOK_API'] == "GoogleBooks": logger.debug("GoogleBooks was hit %s time%s for books" % (st['GR_book_hits'], plural(st['GR_book_hits']))) logger.debug("GoogleBooks language was changed %s time%s" % (st['GB_lang_change'], plural(st['GB_lang_change']))) if lazylibrarian.CONFIG['BOOK_API'] == "GoodReads": logger.debug("GoodReads was hit %s time%s for books" % (st['GR_book_hits'], plural(st['GR_book_hits']))) logger.debug("GoodReads was hit %s time%s for languages" % (st['GR_lang_hits'], plural(st['GR_lang_hits']))) logger.debug("LibraryThing was hit %s time%s for languages" % (st['LT_lang_hits'], plural(st['LT_lang_hits']))) logger.debug("Language cache was hit %s time%s" % (st['cache_hits'], plural(st['cache_hits']))) logger.debug("Unwanted language removed %s book%s" % (st['bad_lang'], plural(st['bad_lang']))) logger.debug("Unwanted characters removed %s book%s" % (st['bad_char'], plural(st['bad_char']))) logger.debug("Unable to cache language for %s book%s with missing ISBN" % (st['uncached'], plural(st['uncached']))) logger.debug("Found %s duplicate book%s" % (st['duplicates'], plural(st['duplicates']))) logger.debug("Rescan %s hit%s, %s miss" % (rescan_hits, plural(rescan_hits), rescan_count - rescan_hits)) logger.debug("Cache %s hit%s, %s miss" % (lazylibrarian.CACHE_HIT, plural(lazylibrarian.CACHE_HIT), lazylibrarian.CACHE_MISS)) cachesize = myDB.match("select count('ISBN') as counter from languages") logger.debug("ISBN Language cache holds %s entries" % cachesize['counter']) # Cache any covers and images images = myDB.select('select bookid, bookimg, bookname from books where bookimg like "http%"') if len(images): logger.info("Caching cover%s for %i book%s" % (plural(len(images)), len(images), plural(len(images)))) for item in images: bookid = item['bookid'] bookimg = item['bookimg'] # bookname = item['bookname'] newimg, success = cache_img("book", bookid, bookimg) if success: myDB.action('update books set BookImg="%s" where BookID="%s"' % (newimg, bookid)) images = myDB.select('select AuthorID, AuthorImg, AuthorName from authors where AuthorImg like "http%"') if len(images): logger.info("Caching image%s for %i author%s" % (plural(len(images)), len(images), plural(len(images)))) for item in images: authorid = item['authorid'] authorimg = item['authorimg'] # authorname = item['authorname'] newimg, success = cache_img("author", authorid, authorimg) if success: myDB.action('update authors set AuthorImg="%s" where AuthorID="%s"' % (newimg, authorid)) # On full scan, update bookcounts for all authors, not just new ones - refresh may have located # new books for existing authors especially if switched provider gb/gr or changed wanted languages authors = myDB.select('select AuthorID from authors') else: # On single author/book import, just update bookcount for that author authors = myDB.select('select AuthorID from authors where AuthorName = "%s"' % author.replace('"', '""')) logger.debug('Updating bookcounts for %i author%s' % (len(authors), plural(len(authors)))) for author in authors: update_totals(author['AuthorID']) logger.info('Library scan complete') return new_book_count except Exception: logger.error('Unhandled exception in libraryScan: %s' % traceback.format_exc())
def processDir(reset=False): try: threadname = threading.currentThread().name if "Thread-" in threadname: threading.currentThread().name = "POSTPROCESS" processpath = lazylibrarian.DIRECTORY('Download') logger.debug('Checking [%s] for files to post process' % processpath) try: downloads = os.listdir(processpath) except OSError as why: logger.error('Could not access [%s] directory [%s]' % (processpath, why.strerror)) return myDB = database.DBConnection() snatched = myDB.select('SELECT * from wanted WHERE Status="Snatched"') if len(snatched) == 0: logger.info('Nothing marked as snatched.') scheduleJob(action='Stop', target='processDir') return if len(downloads) == 0: logger.info('No downloads are found. Nothing to process yet.') return logger.info("Checking %s download%s for %s snatched file%s" % (len(downloads), plural(len(downloads)), len(snatched), plural(len(snatched)))) ppcount = 0 for book in snatched: # if torrent, see if we can get current status from the downloader as the name # may have been changed once magnet resolved, or download started or completed # depending on torrent downloader. Usenet doesn't change the name. We like usenet. torrentname = '' try: logger.debug("%s was sent to %s" % (book['NZBtitle'], book['Source'])) if book['Source'] == 'TRANSMISSION': torrentname = transmission.getTorrentFolder(book['DownloadID']) elif book['Source'] == 'UTORRENT': torrentname = utorrent.nameTorrent(book['DownloadID']) elif book['Source'] == 'RTORRENT': torrentname = rtorrent.getName(book['DownloadID']) elif book['Source'] == 'QBITTORRENT': torrentname = qbittorrent.getName(book['DownloadID']) elif book['Source'] == 'SYNOLOGY_TOR': torrentname = synology.getName(book['DownloadID']) elif book['Source'] == 'DELUGEWEBUI': torrentname = deluge.getTorrentFolder(book['DownloadID']) elif book['Source'] == 'DELUGERPC': client = DelugeRPCClient(lazylibrarian.DELUGE_HOST, int(lazylibrarian.DELUGE_PORT), lazylibrarian.DELUGE_USER, lazylibrarian.DELUGE_PASS) try: client.connect() result = client.call('core.get_torrent_status', book['DownloadID'], {}) # for item in result: # logger.debug ('Deluge RPC result %s: %s' % (item, result[item])) if 'name' in result: torrentname = unaccented_str(result['name']) except Exception as e: logger.debug('DelugeRPC failed %s' % str(e)) except Exception as e: logger.debug("Failed to get updated torrent name from %s for %s: %s" % (book['Source'], book['DownloadID'], str(e))) matchtitle = unaccented_str(book['NZBtitle']) if torrentname and torrentname != matchtitle: logger.debug("%s Changing [%s] to [%s]" % (book['Source'], matchtitle, torrentname)) myDB.action('UPDATE wanted SET NZBtitle = "%s" WHERE NZBurl = "%s"' % (torrentname, book['NZBurl'])) matchtitle = torrentname # here we could also check percentage downloaded or eta or status? # If downloader says it hasn't completed, no need to look for it. matches = [] logger.info('Looking for %s in %s' % (matchtitle, processpath)) for fname in downloads: # skip if failed before or incomplete torrents, or incomplete btsync extn = os.path.splitext(fname)[1] if extn not in ['.fail', '.part', '.bts', '.!ut']: # This is to get round differences in torrent filenames. # Usenet is ok, but Torrents aren't always returned with the name we searched for # We ask the torrent downloader for the torrent name, but don't always get an answer # so we try to do a "best match" on the name, there might be a better way... if isinstance(fname, str): matchname = fname.decode(lazylibrarian.SYS_ENCODING) else: matchname = fname if ' LL.(' in matchname: matchname = matchname.split(' LL.(')[0] match = 0 if matchtitle: if ' LL.(' in matchtitle: matchtitle = matchtitle.split(' LL.(')[0] match = fuzz.token_set_ratio(matchtitle, matchname) if match and match >= lazylibrarian.DLOAD_RATIO: fname = matchname if os.path.isfile(os.path.join(processpath, fname)): # handle single file downloads here. Book/mag file in download root. # move the file into it's own subdirectory so we don't move/delete things that aren't ours logger.debug('filename [%s] is a file' % os.path.join(processpath, fname)) if is_valid_booktype(fname, booktype="book") \ or is_valid_booktype(fname, booktype="mag"): logger.debug('filename [%s] is a valid book/mag' % os.path.join(processpath, fname)) if bts_file(processpath): logger.debug("Skipping %s, found a .bts file" % processpath) else: fname = os.path.splitext(fname)[0] dirname = os.path.join(processpath, fname) if not os.path.exists(dirname): try: os.makedirs(dirname) setperm(dirname) except OSError as why: logger.debug('Failed to create directory %s, %s' % (dirname, why.strerror)) if os.path.exists(dirname): # move the book and any related files too # ie other book formats, or opf, jpg with same title # can't move metadata.opf or cover.jpg or similar # as can't be sure they are ours # not sure if we need a new listdir here, or whether we can use the old one list_dir = os.listdir(processpath) for ourfile in list_dir: if ourfile.startswith(fname): if is_valid_booktype(ourfile, booktype="book") \ or is_valid_booktype(ourfile, booktype="mag") \ or os.path.splitext(ourfile)[1].lower() in ['.opf', '.jpg']: try: if lazylibrarian.DESTINATION_COPY: shutil.copyfile(os.path.join(processpath, ourfile), os.path.join(dirname, ourfile)) setperm(os.path.join(dirname, ourfile)) else: shutil.move(os.path.join(processpath, ourfile), os.path.join(dirname, ourfile)) setperm(os.path.join(dirname, ourfile)) except Exception as why: logger.debug("Failed to copy/move file %s to %s, %s" % (ourfile, dirname, str(why))) pp_path = os.path.join(processpath, fname) if os.path.isdir(pp_path): logger.debug('Found folder (%s%%) %s for %s' % (match, pp_path, matchtitle)) if not os.listdir(pp_path): logger.debug("Skipping %s, folder is empty" % pp_path) elif bts_file(pp_path): logger.debug("Skipping %s, found a .bts file" % pp_path) else: matches.append([match, pp_path, book]) else: pp_path = os.path.join(processpath, fname) matches.append([match, pp_path, book]) # so we can report closest match else: logger.debug('Skipping %s' % fname) match = 0 if matches: highest = max(matches, key=lambda x: x[0]) match = highest[0] pp_path = highest[1] book = highest[2] if match and match >= lazylibrarian.DLOAD_RATIO: logger.debug(u'Found match (%s%%): %s for %s' % (match, pp_path, book['NZBtitle'])) data = myDB.match('SELECT * from books WHERE BookID="%s"' % book['BookID']) if data: # it's a book logger.debug(u'Processing book %s' % book['BookID']) authorname = data['AuthorName'] bookname = data['BookName'] if 'windows' in platform.system().lower() and '/' in lazylibrarian.EBOOK_DEST_FOLDER: logger.warn('Please check your EBOOK_DEST_FOLDER setting') lazylibrarian.EBOOK_DEST_FOLDER = lazylibrarian.EBOOK_DEST_FOLDER.replace('/', '\\') # Default destination path, should be allowed change per config file. dest_path = lazylibrarian.EBOOK_DEST_FOLDER.replace('$Author', authorname).replace( '$Title', bookname) global_name = lazylibrarian.EBOOK_DEST_FILE.replace('$Author', authorname).replace( '$Title', bookname) global_name = unaccented(global_name) # dest_path = authorname+'/'+bookname # global_name = bookname + ' - ' + authorname # Remove characters we don't want in the filename BEFORE adding to DESTINATION_DIR # as windows drive identifiers have colon, eg c: but no colons allowed elsewhere? dic = {'<': '', '>': '', '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': ''} dest_path = unaccented_str(replace_all(dest_path, dic)) dest_path = os.path.join(processpath, dest_path).encode(lazylibrarian.SYS_ENCODING) else: data = myDB.match('SELECT * from magazines WHERE Title="%s"' % book['BookID']) if data: # it's a magazine logger.debug(u'Processing magazine %s' % book['BookID']) # AuxInfo was added for magazine release date, normally housed in 'magazines' but if multiple # files are downloading, there will be an error in post-processing, trying to go to the # same directory. mostrecentissue = data['IssueDate'] # keep for processing issues arriving out of order # Remove characters we don't want in the filename before (maybe) adding to DESTINATION_DIR # as windows drive identifiers have colon, eg c: but no colons allowed elsewhere? dic = {'<': '', '>': '', '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': ''} mag_name = unaccented_str(replace_all(book['BookID'], dic)) # book auxinfo is a cleaned date, eg 2015-01-01 dest_path = lazylibrarian.MAG_DEST_FOLDER.replace( '$IssueDate', book['AuxInfo']).replace('$Title', mag_name) if lazylibrarian.MAG_RELATIVE: if dest_path[0] not in '._': dest_path = '_' + dest_path dest_path = os.path.join(processpath, dest_path).encode( lazylibrarian.SYS_ENCODING) else: dest_path = dest_path.encode(lazylibrarian.SYS_ENCODING) authorname = None bookname = None global_name = lazylibrarian.MAG_DEST_FILE.replace('$IssueDate', book['AuxInfo']).replace( '$Title', mag_name) global_name = unaccented(global_name) else: # not recognised logger.debug('Nothing in database matching "%s"' % book['BookID']) continue else: logger.debug("Snatched %s %s is not in download directory" % (book['NZBmode'], book['NZBtitle'])) if match: logger.debug(u'Closest match (%s%%): %s' % (match, pp_path)) #for match in matches: # logger.info('Match: %s%% %s' % (match[0], match[1])) continue processBook = processDestination(pp_path, dest_path, authorname, bookname, global_name) if processBook: logger.debug("Processing %s, %s" % (global_name, book['NZBurl'])) # update nzbs, only update the snatched ones in case multiple matches for same book / magazine issue controlValueDict = {"BookID": book['BookID'], "NZBurl": book['NZBurl'], "Status": "Snatched"} newValueDict = {"Status": "Processed", "NZBDate": now()} # say when we processed it myDB.upsert("wanted", newValueDict, controlValueDict) if bookname: # it's a book, if None it's a magazine if len(lazylibrarian.IMP_CALIBREDB): logger.debug('Calibre should have created the extras for us') else: processExtras(myDB, dest_path, global_name, data) else: # update mags controlValueDict = {"Title": book['BookID']} if mostrecentissue: if mostrecentissue.isdigit() and str(book['AuxInfo']).isdigit(): older = int(mostrecentissue) > int(book['AuxInfo']) # issuenumber else: older = mostrecentissue > book['AuxInfo'] # YYYY-MM-DD else: older = False if older: # check this in case processing issues arriving out of order newValueDict = {"LastAcquired": today(), "IssueStatus": "Open"} else: newValueDict = {"IssueDate": book['AuxInfo'], "LastAcquired": today(), "IssueStatus": "Open"} myDB.upsert("magazines", newValueDict, controlValueDict) # dest_path is where we put the magazine after processing, but we don't have the full filename # so look for any "book" in that directory dest_file = book_file(dest_path, booktype='mag') controlValueDict = {"Title": book['BookID'], "IssueDate": book['AuxInfo']} newValueDict = {"IssueAcquired": today(), "IssueFile": dest_file, "IssueID": create_id("%s %s" % (book['BookID'], book['AuxInfo'])) } myDB.upsert("issues", newValueDict, controlValueDict) # create a thumbnail cover for the new issue create_cover(dest_file) # calibre or ll copied/moved the files we want, now delete source files to_delete = True if book['NZBmode'] in ['torrent', 'magnet']: # Only delete torrents if we don't want to keep seeding if lazylibrarian.KEEP_SEEDING: logger.warn('%s is seeding %s %s' % (book['Source'], book['NZBmode'], book['NZBtitle'])) to_delete = False else: # ask downloader to delete the torrent, but not the files # we may delete them later, depending on other settings if book['DownloadID'] != "unknown": logger.debug('Removing %s from %s' % (book['NZBtitle'], book['Source'].lower())) delete_task(book['Source'], book['DownloadID'], False) else: logger.warn("Unable to remove %s from %s, no DownloadID" % (book['NZBtitle'], book['Source'].lower())) if to_delete: # only delete the files if not in download root dir and if DESTINATION_COPY not set if not lazylibrarian.DESTINATION_COPY and (pp_path != processpath): if os.path.isdir(pp_path): # calibre might have already deleted it? try: shutil.rmtree(pp_path) except Exception as why: logger.debug("Unable to remove %s, %s" % (pp_path, str(why))) logger.info('Successfully processed: %s' % global_name) ppcount = ppcount + 1 notify_download("%s from %s at %s" % (global_name, book['NZBprov'], now())) else: logger.error('Postprocessing for %s has failed.' % global_name) logger.error('Warning - Residual files remain in %s.fail' % pp_path) controlValueDict = {"NZBurl": book['NZBurl'], "Status": "Snatched"} newValueDict = {"Status": "Failed", "NZBDate": now()} myDB.upsert("wanted", newValueDict, controlValueDict) # if it's a book, reset status so we try for a different version # if it's a magazine, user can select a different one from pastissues table if bookname: myDB.action('UPDATE books SET status = "Wanted" WHERE BookID="%s"' % book['BookID']) # at this point, as it failed we should move it or it will get postprocessed # again (and fail again) try: os.rename(pp_path, pp_path + '.fail') except Exception as e: logger.debug("Unable to rename %s, %s" % (pp_path, str(e))) downloads = os.listdir(processpath) # check in case we processed/deleted some above for directory in downloads: dname, extn = os.path.splitext(directory) if "LL.(" in dname and extn not in ['.fail', '.part', '.bts', '.!ut']: bookID = str(directory).split("LL.(")[1].split(")")[0] logger.debug("Book with id: " + str(bookID) + " found in download directory") pp_path = os.path.join(processpath, directory) if os.path.isfile(pp_path): pp_path = os.path.join(processpath) if (os.path.isdir(pp_path)): if import_book(pp_path, bookID): ppcount = ppcount + 1 if ppcount == 0: logger.info('No snatched books/mags have been found') else: logger.info('%s book%s/mag%s processed.' % (ppcount, plural(ppcount), plural(ppcount))) # Now check for any that are still marked snatched... if lazylibrarian.TASK_AGE: snatched = myDB.select('SELECT * from wanted WHERE Status="Snatched"') if len(snatched) > 0: for snatch in snatched: # FUTURE: we could check percentage downloaded or eta? # if percentage is increasing, it's just slow try: when_snatched = time.strptime(snatch['NZBdate'], '%Y-%m-%d %H:%M:%S') when_snatched = time.mktime(when_snatched) diff = time.time() - when_snatched # time difference in seconds except: diff = 0 hours = int(diff / 3600) if hours >= lazylibrarian.TASK_AGE: logger.warn('%s was sent to %s %s hours ago, deleting failed task' % (snatch['NZBtitle'], snatch['Source'].lower(), hours)) # change status to "Failed", and ask downloader to delete task and files if snatch['BookID'] != 'unknown': myDB.action('UPDATE wanted SET Status="Failed" WHERE BookID="%s"' % snatch['BookID']) myDB.action('UPDATE books SET status = "Wanted" WHERE BookID="%s"' % snatch['BookID']) delete_task(snatch['Source'], snatch['DownloadID'], True) if reset: scheduleJob(action='Restart', target='processDir') except Exception as e: logger.error('Unhandled exception in processDir: %s' % traceback.format_exc())
def processResultList(resultlist, book, searchtype): myDB = database.DBConnection() dictrepl = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': ' ', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '', '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '', '!': '', '-': ' ', '\s\s': ' '} # ' the ': ' ', ' a ': ' ', ' and ': ' ', ' to ': ' ', ' of ': ' ', # ' for ': ' ', ' my ': ' ', ' in ': ' ', ' at ': ' ', ' with ': ' '} dic = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': ''} match_ratio = int(lazylibrarian.MATCH_RATIO) reject_list = formatter.getList(lazylibrarian.REJECT_WORDS) matches = [] for tor in resultlist: torTitle = formatter.latinToAscii(formatter.replace_all(str(tor['tor_title']), dictrepl)).strip() torTitle = re.sub(r"\s\s+", " ", torTitle) # remove extra whitespace author = formatter.latinToAscii(formatter.replace_all(book['authorName'], dic)) title = formatter.latinToAscii(formatter.replace_all(book['bookName'], dic)) torAuthor_match = fuzz.token_set_ratio(author, torTitle) torBook_match = fuzz.token_set_ratio(title, torTitle) logger.debug(u"TOR author/book Match: %s/%s for %s" % (torAuthor_match, torBook_match, torTitle)) rejected = False for word in reject_list: if word in torTitle.lower() and not word in author.lower() and not word in title.lower(): rejected = True logger.debug("Rejecting %s, contains %s" % (torTitle, word)) break if (torAuthor_match >= match_ratio and torBook_match >= match_ratio and not rejected): #logger.debug(u'Found Torrent: %s using %s search' % (tor['tor_title'], searchtype)) bookid = book['bookid'] tor_Title = (author + ' - ' + title + ' LL.(' + book['bookid'] + ')').strip() tor_url = tor['tor_url'] tor_prov = tor['tor_prov'] tor_size_temp = tor['tor_size'] # Need to cater for when this is NONE (Issue 35) if tor_size_temp is None: tor_size_temp = 1000 tor_size = str(round(float(tor_size_temp) / 1048576, 2)) + ' MB' controlValueDict = {"NZBurl": tor_url} newValueDict = { "NZBprov": tor_prov, "BookID": bookid, "NZBdate": formatter.now(), # when we asked for it "NZBsize": tor_size, "NZBtitle": tor_Title, "NZBmode": "torrent", "Status": "Skipped" } score = (torBook_match + torAuthor_match)/2 # as a percentage # lose a point for each extra word in the title so we get the closest match words = len(formatter.getList(torTitle)) words -= len(formatter.getList(author)) words -= len(formatter.getList(title)) score -= abs(words) matches.append([score, torTitle, newValueDict, controlValueDict]) if matches: highest = max(matches, key=lambda x: x[0]) score = highest[0] nzb_Title = highest[1] newValueDict = highest[2] controlValueDict = highest[3] logger.info(u'Best match TOR (%s%%): %s using %s search' % (score, nzb_Title, searchtype)) myDB.upsert("wanted", newValueDict, controlValueDict) snatchedbooks = myDB.action('SELECT * from books WHERE BookID="%s" and Status="Snatched"' % newValueDict["BookID"]).fetchone() if not snatchedbooks: snatch = TORDownloadMethod(newValueDict["BookID"], newValueDict["NZBprov"], newValueDict["NZBtitle"], controlValueDict["NZBurl"]) if snatch: notifiers.notify_snatch(newValueDict["NZBtitle"] + ' at ' + formatter.now()) common.schedule_job(action='Start', target='processDir') return True logger.debug("No torrent's found for " + (book["authorName"] + ' ' + book['bookName']).strip() + " using searchtype " + searchtype) return False
def processResultList(resultlist, authorname, bookname, book, searchtype): myDB = database.DBConnection() dictrepl = { '...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': ' ', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '', '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '', '!': '', '-': ' ', '\s\s': ' ' } match_ratio = int(lazylibrarian.MATCH_RATIO) reject_list = getList(lazylibrarian.REJECT_WORDS) matches = [] # bit of a misnomer now, rss can search both tor and nzb rss feeds for tor in resultlist: torTitle = unaccented_str(replace_all(tor['tor_title'], dictrepl)).strip() torTitle = re.sub(r"\s\s+", " ", torTitle) # remove extra whitespace tor_Author_match = fuzz.token_set_ratio(authorname, torTitle) tor_Title_match = fuzz.token_set_ratio(bookname, torTitle) logger.debug("RSS Author/Title Match: %s/%s for %s" % (tor_Author_match, tor_Title_match, torTitle)) tor_url = tor['tor_url'] rejected = False already_failed = myDB.match( 'SELECT * from wanted WHERE NZBurl="%s" and Status="Failed"' % tor_url) if already_failed: logger.debug("Rejecting %s, blacklisted at %s" % (torTitle, already_failed['NZBprov'])) rejected = True if not rejected: for word in reject_list: if word in torTitle.lower() and word not in authorname.lower( ) and word not in bookname.lower(): rejected = True logger.debug("Rejecting %s, contains %s" % (torTitle, word)) break tor_size_temp = tor[ 'tor_size'] # Need to cater for when this is NONE (Issue 35) if tor_size_temp is None: tor_size_temp = 1000 tor_size = round(float(tor_size_temp) / 1048576, 2) maxsize = check_int(lazylibrarian.REJECT_MAXSIZE, 0) if not rejected: if maxsize and tor_size > maxsize: rejected = True logger.debug("Rejecting %s, too large" % torTitle) if not rejected: bookid = book['bookid'] tor_Title = (book["authorName"] + ' - ' + book['bookName'] + ' LL.(' + book['bookid'] + ')').strip() tor_prov = tor['tor_prov'] tor_feed = tor['tor_feed'] controlValueDict = {"NZBurl": tor_url} newValueDict = { "NZBprov": tor_prov, "BookID": bookid, "NZBdate": now(), # when we asked for it "NZBsize": tor_size, "NZBtitle": tor_Title, "NZBmode": "torrent", "Status": "Skipped" } score = (tor_Title_match + tor_Author_match) / 2 # as a percentage # lose a point for each extra word in the title so we get the closest match words = len(getList(torTitle)) words -= len(getList(authorname)) words -= len(getList(bookname)) score -= abs(words) matches.append([score, torTitle, newValueDict, controlValueDict]) if matches: highest = max(matches, key=lambda x: x[0]) score = highest[0] nzb_Title = highest[1] newValueDict = highest[2] controlValueDict = highest[3] if score < match_ratio: logger.debug( u'Nearest RSS match (%s%%): %s using %s search for %s %s' % (score, nzb_Title, searchtype, authorname, bookname)) return False logger.info(u'Best RSS match (%s%%): %s using %s search' % (score, nzb_Title, searchtype)) snatchedbooks = myDB.match( 'SELECT * from books WHERE BookID="%s" and Status="Snatched"' % newValueDict["BookID"]) if snatchedbooks: # check if one of the other downloaders got there first logger.info('%s already marked snatched' % nzb_Title) return True else: myDB.upsert("wanted", newValueDict, controlValueDict) tor_url = controlValueDict["NZBurl"] if '.nzb' in tor_url: snatch = NZBDownloadMethod(newValueDict["BookID"], newValueDict["NZBprov"], newValueDict["NZBtitle"], controlValueDict["NZBurl"]) else: """ # http://baconbits.org/torrents.php?action=download&authkey=<authkey>&torrent_pass=<password.hashed>&id=185398 if not tor_url.startswith('magnet'): # magnets don't use auth pwd = lazylibrarian.RSS_PROV[tor_feed]['PASS'] auth = lazylibrarian.RSS_PROV[tor_feed]['AUTH'] # don't know what form of password hash is required, try sha1 tor_url = tor_url.replace('<authkey>', auth).replace('<password.hashed>', sha1(pwd)) """ snatch = TORDownloadMethod(newValueDict["BookID"], newValueDict["NZBprov"], newValueDict["NZBtitle"], tor_url) if snatch: logger.info( 'Downloading %s from %s' % (newValueDict["NZBtitle"], newValueDict["NZBprov"])) notify_snatch( "%s from %s at %s" % (newValueDict["NZBtitle"], newValueDict["NZBprov"], now())) scheduleJob(action='Start', target='processDir') return True + True # we found it else: logger.debug("No RSS found for " + (book["authorName"] + ' ' + book['bookName']).strip()) return False
def search_magazines(mags=None, reset=False): # produce a list of magazines to search for, tor, nzb, torznab, rss try: threadname = threading.currentThread().name if "Thread-" in threadname: threading.currentThread().name = "SEARCHMAG" myDB = database.DBConnection() searchlist = [] if mags is None: # backlog search searchmags = myDB.select('SELECT Title, Regex, LastAcquired, \ IssueDate from magazines WHERE Status="Active"') else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select('SELECT Title, Regex, LastAcquired, IssueDate from magazines \ WHERE Title="%s" AND Status="Active"' % (magazine['bookid'])) for terms in searchmags_temp: searchmags.append(terms) if len(searchmags) == 0: return # should clear old search results as might not be available any more # ie torrent not available, changed providers, out of news server retention etc. # Only delete the "skipped" ones, not wanted/snatched/processed/ignored logger.debug(u"Removing old magazine search results") myDB.action('DELETE from pastissues WHERE Status="Skipped"') logger.info('Searching for %i magazine%s' % (len(searchmags), plural(len(searchmags)))) for searchmag in searchmags: bookid = searchmag['Title'] searchterm = searchmag['Regex'] if not searchterm: searchterm = searchmag['Title'] dic = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': ''} searchterm = unaccented_str(replace_all(searchterm, dic)) searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode(lazylibrarian.SYS_ENCODING) searchlist.append({"bookid": bookid, "searchterm": searchterm}) if searchlist == []: logger.warn('There is nothing to search for. Mark some magazines as active.') for book in searchlist: resultlist = [] tor_resultlist = [] if lazylibrarian.USE_NZB(): resultlist, nproviders = IterateOverNewzNabSites(book, 'mag') if not nproviders: logger.warn('No nzb providers are set. Check config for NEWZNAB or TORZNAB providers') if lazylibrarian.USE_TOR(): tor_resultlist, nproviders = IterateOverTorrentSites(book, 'mag') if not nproviders: logger.warn('No torrent providers are set. Check config for TORRENT providers') if tor_resultlist: for item in tor_resultlist: # reformat the torrent results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned from torrents 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if lazylibrarian.USE_RSS(): rss_resultlist, nproviders = IterateOverRSSSites(book, 'mag') if not nproviders: logger.warn('No rss providers are set. Check config for RSS providers') if rss_resultlist: for item in rss_resultlist: # reformat the rss results so they look like nzbs resultlist.append({ 'bookid': book['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': item['tor_date'], # may be fake date as none returned from rss torrents, only rss nzb 'nzbsize': item['tor_size'], 'nzbmode': item['tor_type'] }) if not resultlist: logger.debug("Adding magazine %s to queue." % book['searchterm']) else: bad_name = 0 bad_date = 0 old_date = 0 total_nzbs = 0 new_date = 0 maglist = [] issues = [] for nzb in resultlist: total_nzbs = total_nzbs + 1 bookid = nzb['bookid'] nzbtitle = unaccented_str(nzb['nzbtitle']) nzbtitle = nzbtitle.replace('"', '').replace("'", "") # suppress " in titles nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] nzbsize_temp = check_int(nzbsize_temp, 1000) # not all torrents returned by torznab have a size nzbsize = round(float(nzbsize_temp) / 1048576, 2) nzbdate = nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] results = myDB.match('SELECT * from magazines WHERE Title="%s"' % bookid) if not results: logger.debug('Magazine [%s] does not match search term [%s].' % (nzbtitle, bookid)) bad_name = bad_name + 1 else: rejected = False maxsize = check_int(lazylibrarian.REJECT_MAGSIZE, 0) if maxsize and nzbsize > maxsize: logger.debug("Rejecting %s, too large" % nzbtitle) rejected = True if not rejected: control_date = results['IssueDate'] reject_list = getList(results['Reject']) dic = {'.': ' ', '-': ' ', '/': ' ', '+': ' ', '_': ' ', '(': '', ')': ''} nzbtitle_formatted = replace_all(nzbtitle, dic).strip() # Need to make sure that substrings of magazine titles don't get found # (e.g. Maxim USA will find Maximum PC USA) - token_set_ratio takes care of this # remove extra spaces if they're in a row nzbtitle_exploded_temp = " ".join(nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_exploded_temp.split(' ') if ' ' in bookid: bookid_exploded = bookid.split(' ') else: bookid_exploded = [bookid] # check nzb starts with magazine title followed by a date # eg The MagPI Issue 22 - July 2015 if len(nzbtitle_exploded) > len(bookid_exploded): # needs to be longer as it has to include a date # check (nearly) all the words in the mag title are in the nzbtitle - allow some fuzz mag_title_match = fuzz.token_set_ratio( unaccented(bookid), unaccented(nzbtitle_formatted)) if mag_title_match < lazylibrarian.MATCH_RATIO: logger.debug( u"Magazine token set Match failed: " + str( mag_title_match) + "% for " + nzbtitle_formatted) rejected = True else: logger.debug( u"Magazine matched: " + str( mag_title_match) + "% " + bookid + " for " + nzbtitle_formatted) else: rejected = True if not rejected: already_failed = myDB.match('SELECT * from wanted WHERE NZBurl="%s" and Status="Failed"' % nzburl) if already_failed: logger.debug("Rejecting %s, blacklisted at %s" % (nzbtitle_formatted, already_failed['NZBprov'])) rejected = True if not rejected: lower_title = unaccented(nzbtitle_formatted).lower() lower_bookid = unaccented(bookid).lower() for word in reject_list: if word in lower_title and word not in lower_bookid: rejected = True logger.debug("Rejecting %s, contains %s" % (nzbtitle_formatted, word)) break if not rejected: # some magazine torrent uploaders add their sig in [] or {} # Fortunately for us, they always seem to add it at the end # also some magazine torrent titles are "magazine_name some_form_of_date pdf" # or other words we don't want. Should make the word list configurable. # so strip all the trailing junk... strip_list = ['pdf', 'true', 'truepdf', 'german', 'ebooks'] while nzbtitle_exploded[len(nzbtitle_exploded) - 1][0] in '[{' or \ nzbtitle_exploded[len(nzbtitle_exploded) - 1].lower() in strip_list: nzbtitle_exploded.pop() # gotta love the function names # need at least one word magazine title and two date components if len(nzbtitle_exploded) > 2: # regexA = DD MonthName YYYY OR MonthName YYYY or Issue nn, MonthName YYYY regexA_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexA_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 2] regexA_month = month2num(unaccented(regexA_month_temp)) if not regexA_year.isdigit() or int(regexA_year) < 1900 or int(regexA_year) > 2100: regexA_year = 'fail' # force date failure # if frequency == "Weekly" or frequency == "BiWeekly": regexA_day = nzbtitle_exploded[len(nzbtitle_exploded) - 3].rstrip(',').zfill(2) if regexA_day.isdigit(): if int(regexA_day) > 31: # probably issue number nn regexA_day = '01' else: regexA_day = '01' # just MonthName YYYY # else: # regexA_day = '01' # monthly, or less frequent try: newdatish = regexA_year + '-' + regexA_month + '-' + regexA_day # try to make sure the year/month/day are valid, exception if not # ie don't accept day > 31, or 30 in some months, or month <1 or >12 # also handles multiple date format named issues eg Jan 2014, 01 2014 # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexA_year), int(regexA_month), int(regexA_day)) except ValueError: # regexB = MonthName DD YYYY or MonthName DD, YYYY regexB_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexB_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 3] regexB_month = month2num(unaccented(regexB_month_temp)) regexB_day = nzbtitle_exploded[len(nzbtitle_exploded) - 2].rstrip(',').zfill(2) if not regexB_year.isdigit() or int(regexB_year) < 1900 or int(regexB_year) > 2100: regexB_year = 'fail' try: newdatish = regexB_year + '-' + regexB_month + '-' + regexB_day # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexB_year), int(regexB_month), int(regexB_day)) except ValueError: # regexC = YYYY MM or YYYY MM DD # (can't get MM/DD if named YYYY Issue nn) # First try YYYY MM regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 2] if regexC_year.isdigit() and int(regexC_year) > 1900 and int(regexC_year) < 2100: regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2) regexC_day = '01' else: # try YYYY MM DD regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 3] regexC_month = 0 regexC_day = 0 if regexC_year.isdigit() and int(regexC_year) > 1900 and int(regexC_year) < 2100: regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 2].zfill(2) regexC_day = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2) else: regexC_year = 'fail' try: newdatish = regexC_year + '-' + regexC_month + '-' + regexC_day # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexC_year), int(regexC_month), int(regexC_day)) except Exception: # regexD Issue/No/Nr/Vol nn, YYYY or Issue/No/Nr/Vol nn try: IssueLabel = nzbtitle_exploded[len(nzbtitle_exploded) - 2] if IssueLabel.lower() in ["issue", "no", "nr", "vol"]: # issue nn regexD_issue = nzbtitle_exploded[len(nzbtitle_exploded) - 1] if regexD_issue.isdigit(): newdatish = str(int(regexD_issue)) # 4 == 04 == 004 else: IssueLabel = nzbtitle_exploded[len(nzbtitle_exploded) - 3] if IssueLabel.lower() in ["issue", "no", "nr", "vol"]: # issue nn, YYYY regexD_issue = nzbtitle_exploded[len(nzbtitle_exploded) - 2] regexD_issue = regexD_issue.strip(',') if regexD_issue.isdigit(): newdatish = str(int(regexD_issue)) # 4 == 04 == 004 else: raise ValueError regexD_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] if regexD_year.isdigit(): if int(regexD_year) < int(datetime.date.today().year): newdatish = 0 # it's old else: raise ValueError except Exception: # regexE nn YYYY issue number without "Nr" before it # nn is assumed not to be a month as they are normally names not digits try: regexE_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexE_issue = nzbtitle_exploded[len(nzbtitle_exploded) - 2] print "[%s][%s]" % (regexE_year, regexE_issue) if regexE_issue.isdigit(): newdatish = int(regexE_issue) if int(regexE_year) < int(datetime.date.today().year): newdatish = 0 # it's old else: raise ValueError except Exception: # regexF issue and year as a single 6 digit string eg 222015 try: regexF = nzbtitle_exploded[len(nzbtitle_exploded) - 1] if regexF.isdigit() and len(regexF) == 6: regexF_issue = regexF[:2] regexF_year = regexF[2:] newdatish = str(int(regexF_issue)) # 4 == 04 == 004 if int(regexF_year) < int(datetime.date.today().year): newdatish = 0 # it's old else: raise ValueError except Exception: logger.debug('Magazine %s not in a recognised date format.' % nzbtitle_formatted) bad_date = bad_date + 1 # allow issues with good name but bad date to be included # so user can manually select them, incl those with issue numbers newdatish = "1970-01-01" # provide a fake date for bad-date issues # continue else: logger.debug('Magazine [%s] does not match the search term [%s].' % ( nzbtitle_formatted, bookid)) bad_name = bad_name + 1 continue # wanted issues go into wanted table marked "Wanted" # the rest into pastissues table marked "Skipped" insert_table = "pastissues" insert_status = "Skipped" if control_date is None: # we haven't got any copies of this magazine yet # get a rough time just over a month ago to compare to, in format yyyy-mm-dd # could perhaps calc differently for weekly, biweekly etc # or for magazines with only an issue number, use zero if '-' in str(newdatish): start_time = time.time() start_time -= int(lazylibrarian.MAG_AGE) * 24 * 60 * 60 # number of seconds in days if start_time < 0: # limit of unixtime (1st Jan 1970) start_time = 0 control_date = time.strftime("%Y-%m-%d", time.localtime(start_time)) logger.debug('Magazine date comparing to %s' % control_date) else: control_date = 0 if '-' in str(control_date) and '-' in str(newdatish): # only grab a copy if it's newer than the most recent we have, # or newer than a month ago if we have none comp_date = datecompare(newdatish, control_date) elif '-' not in str(control_date) and '-' not in str(newdatish): # for issue numbers, check if later than last one we have comp_date = int(newdatish) - int(control_date) newdatish = "%s" % newdatish newdatish = newdatish.zfill(4) # pad so we sort correctly else: # invalid comparison of date and issue number logger.debug('Magazine %s incorrect date or issue format.' % nzbtitle_formatted) bad_date = bad_date + 1 newdatish = "1970-01-01" # this is our fake date for ones we can't decipher comp_date = 0 if comp_date > 0: # keep track of what we're going to download so we don't download dupes new_date = new_date + 1 issue = bookid + ',' + newdatish if issue not in issues: maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl, 'nzbmode': nzbmode }) logger.debug('This issue of %s is new, downloading' % nzbtitle_formatted) issues.append(issue) insert_table = "wanted" insert_status = "Wanted" nzbdate = now() # when we asked for it else: logger.debug('This issue of %s is already flagged for download' % issue) else: if newdatish != "1970-01-01": # this is our fake date for ones we can't decipher logger.debug('This issue of %s is old; skipping.' % nzbtitle_formatted) old_date = old_date + 1 # store only the _new_ matching results # Don't add a new entry if this issue has been found on an earlier search # and status has been user-set ( we only delete the "Skipped" ones ) # In "wanted" table it might be already snatched/downloading/processing mag_entry = myDB.select('SELECT * from %s WHERE NZBtitle="%s" and NZBprov="%s"' % ( insert_table, nzbtitle, nzbprov)) if not mag_entry: controlValueDict = { "NZBtitle": nzbtitle, "NZBprov": nzbprov } newValueDict = { "NZBurl": nzburl, "BookID": bookid, "NZBdate": nzbdate, "AuxInfo": newdatish, "Status": insert_status, "NZBsize": nzbsize, "NZBmode": nzbmode } myDB.upsert(insert_table, newValueDict, controlValueDict) else: # logger.debug('Magazine [%s] was rejected.' % nzbtitle_formatted) bad_name = bad_name + 1 logger.info('Found %i result%s for %s. %i new, %i old, %i fail date, %i fail name: %i to download' % ( total_nzbs, plural(total_nzbs), bookid, new_date, old_date, bad_date, bad_name, len(maglist))) for magazine in maglist: if magazine['nzbmode'] in ["torznab", "torrent", "magnet"]: snatch = TORDownloadMethod( magazine['bookid'], magazine['nzbprov'], magazine['nzbtitle'], magazine['nzburl']) else: snatch = NZBDownloadMethod( magazine['bookid'], magazine['nzbprov'], magazine['nzbtitle'], magazine['nzburl']) if snatch: logger.info('Downloading %s from %s' % (magazine['nzbtitle'], magazine["nzbprov"])) notify_snatch("%s from %s at %s" % (unaccented(magazine['nzbtitle']), magazine["nzbprov"], now())) scheduleJob(action='Start', target='processDir') maglist = [] if reset: scheduleJob(action='Restart', target='search_magazines') logger.info("Search for magazines complete") except Exception as e: logger.error('Unhandled exception in search_magazines: %s' % traceback.format_exc())
def processResultList(resultlist, author, title, book): myDB = database.DBConnection() dictrepl = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': ' ', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '', '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '', '!': '', '-': ' ', '\s\s': ' '} # ' the ': ' ', ' a ': ' ', ' and ': ' ', ' to ': ' ', ' of ': ' ', # ' for ': ' ', ' my ': ' ', ' in ': ' ', ' at ': ' ', ' with ': ' '} match_ratio = int(lazylibrarian.MATCH_RATIO) reject_list = formatter.getList(lazylibrarian.REJECT_WORDS) matches = [] # bit of a misnomer now, rss can search both tor and nzb rss feeds for tor in resultlist: torTitle = formatter.latinToAscii(formatter.replace_all(tor['tor_title'], dictrepl)).strip() torTitle = re.sub(r"\s\s+", " ", torTitle) # remove extra whitespace tor_Author_match = fuzz.token_set_ratio(author, torTitle) tor_Title_match = fuzz.token_set_ratio(title, torTitle) logger.debug("RSS Author/Title Match: %s/%s for %s" % (tor_Author_match, tor_Title_match, torTitle)) rejected = False for word in reject_list: if word in torTitle.lower() and not word in author.lower() and not word in book.lower(): rejected = True logger.debug("Rejecting %s, contains %s" % (torTitle, word)) break tor_size_temp = tor['tor_size'] # Need to cater for when this is NONE (Issue 35) if tor_size_temp is None: tor_size_temp = 1000 tor_size = round(float(tor_size_temp) / 1048576, 2) maxsize = formatter.check_int(lazylibrarian.REJECT_MAXSIZE, 0) if maxsize and tor_size > maxsize: rejected = True logger.debug("Rejecting %s, too large" % torTitle) if (tor_Title_match >= match_ratio and tor_Author_match >= match_ratio and not rejected): #logger.debug(u'Found RSS: %s' % tor['tor_title']) bookid = book['bookid'] tor_Title = (book["authorName"] + ' - ' + book['bookName'] + ' LL.(' + book['bookid'] + ')').strip() tor_url = tor['tor_url'] tor_prov = tor['tor_prov'] tor_feed = tor['tor_feed'] controlValueDict = {"NZBurl": tor_url} newValueDict = { "NZBprov": tor_prov, "BookID": bookid, "NZBdate": formatter.now(), # when we asked for it "NZBsize": tor_size, "NZBtitle": tor_Title, "NZBmode": "torrent", "Status": "Skipped" } score = (tor_Title_match + tor_Author_match)/2 # as a percentage # lose a point for each extra word in the title so we get the closest match words = len(formatter.getList(torTitle)) words -= len(formatter.getList(author)) words -= len(formatter.getList(title)) score -= abs(words) matches.append([score, torTitle, newValueDict, controlValueDict]) if matches: highest = max(matches, key=lambda x: x[0]) score = highest[0] nzb_Title = highest[1] newValueDict = highest[2] controlValueDict = highest[3] logger.info(u'Best match RSS (%s%%): %s using %s search' % (score, nzb_Title, searchtype)) myDB.upsert("wanted", newValueDict, controlValueDict) snatchedbooks = myDB.action('SELECT * from books WHERE BookID="%s" and Status="Snatched"' % newValueDict["BookID"]).fetchone() if not snatchedbooks: # check if one of the other downloaders got there first tor_url = controlValueDict["NZBurl"] if '.nzb' in tor_url: snatch = NZBDownloadMethod(newValueDict["BookID"], newValueDict["NZBprov"], newValueDict["NZBtitle"], controlValueDict["NZBurl"]) else: """ # http://baconbits.org/torrents.php?action=download&authkey=<authkey>&torrent_pass=<password.hashed>&id=185398 if not tor_url.startswith('magnet'): # magnets don't use auth pwd = lazylibrarian.RSS_PROV[tor_feed]['PASS'] auth = lazylibrarian.RSS_PROV[tor_feed]['AUTH'] # don't know what form of password hash is required, try sha1 tor_url = tor_url.replace('<authkey>', auth).replace('<password.hashed>', sha1(pwd)) """ snatch = TORDownloadMethod(newValueDict["BookID"], newValueDict["NZBprov"], newValueDict["NZBtitle"], tor_url) if snatch: notifiers.notify_snatch(newValueDict["NZBtitle"] + ' at ' + formatter.now()) common.schedule_job(action='Start', target='processDir') return True logger.debug("No RSS found for " + (book["authorName"] + ' ' + book['bookName']).strip()) return False
def processDir(force=False, reset=False): # rename this thread threading.currentThread().name = "POSTPROCESS" if not lazylibrarian.DOWNLOAD_DIR or not os.path.isdir(lazylibrarian.DOWNLOAD_DIR): processpath = os.getcwd() else: processpath = lazylibrarian.DOWNLOAD_DIR logger.debug(' Checking [%s] for files to post process' % processpath) try: downloads = os.listdir(processpath) except OSError as why: logger.error('Could not access [%s] directory [%s]' % (processpath, why.strerror)) return False myDB = database.DBConnection() snatched = myDB.select('SELECT * from wanted WHERE Status="Snatched"') if force is False and len(snatched) == 0: logger.info('Nothing marked as snatched. Stopping postprocessor job.') common.schedule_job(action='Stop', target='processDir') elif len(downloads) == 0: logger.info('No downloads are found. Nothing to process.') else: logger.debug("Checking %s downloads for %s snatched files" % (len(downloads), len(snatched))) ppcount = 0 for book in snatched: found = False for fname in downloads: if not fname.endswith('.fail'): # has this failed before? # this is to get round differences in torrent filenames. # Torrents aren't always returned with the name we searched for # there might be a better way... if isinstance(fname, str): matchname = fname.decode(lazylibrarian.SYS_ENCODING) else: matchname = fname if ' LL.(' in matchname: matchname = matchname.split(' LL.(')[0] matchtitle = book['NZBtitle'] if ' LL.(' in matchtitle: matchtitle = matchtitle.split(' LL.(')[0] match = fuzz.token_set_ratio(matchtitle, matchname) if match >= 95: fname = matchname if os.path.isfile(os.path.join(processpath, fname)): # handle single file downloads here... if formatter.is_valid_booktype(fname, booktype="book") \ or formatter.is_valid_booktype(fname, booktype="mag"): dirname = os.path.join(processpath, os.path.splitext(fname)[0]) if not os.path.exists(dirname): try: os.makedirs(dirname) except OSError as why: logger.debug('Failed to create directory %s, %s' % (dirname, why.strerror)) if os.path.exists(dirname): try: shutil.move(os.path.join(processpath, fname), os.path.join(dirname, fname)) fname = os.path.splitext(fname)[0] except Exception as why: logger.debug("Failed to move file %s to %s, %s" % (fname, dirname, str(why))) if os.path.isdir(os.path.join(processpath, fname)): pp_path = os.path.join(processpath, fname) logger.debug('Found folder %s for %s' % (pp_path, book['NZBtitle'])) found = True break else: logger.debug('No match (%s%%) %s for %s' % (match, matchname, matchtitle)) else: logger.debug('Skipping %s' % fname) if found: data = myDB.select('SELECT * from books WHERE BookID="%s"' % book['BookID']) if data: authorname = data[0]['AuthorName'] bookname = data[0]['BookName'] if 'windows' in platform.system().lower() and '/' in lazylibrarian.EBOOK_DEST_FOLDER: logger.warn('Please check your EBOOK_DEST_FOLDER setting') lazylibrarian.EBOOK_DEST_FOLDER = lazylibrarian.EBOOK_DEST_FOLDER.replace('/', '\\') # Default destination path, should be allowed change per config file. dest_path = lazylibrarian.EBOOK_DEST_FOLDER.replace('$Author', authorname).replace( '$Title', bookname) global_name = lazylibrarian.EBOOK_DEST_FILE.replace('$Author', authorname).replace( '$Title', bookname) global_name = common.remove_accents(global_name) # dest_path = authorname+'/'+bookname # global_name = bookname + ' - ' + authorname # Remove characters we don't want in the filename BEFORE adding to DESTINATION_DIR # as windows drive identifiers have colon, eg c: but no colons allowed elsewhere? dic = {'<': '', '>': '', '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': ''} dest_path = formatter.latinToAscii(formatter.replace_all(dest_path, dic)) dest_path = os.path.join(lazylibrarian.DESTINATION_DIR, dest_path).encode( lazylibrarian.SYS_ENCODING) else: data = myDB.select('SELECT * from magazines WHERE Title="%s"' % book['BookID']) if data: # AuxInfo was added for magazine release date, normally housed in 'magazines' but if multiple # files are downloading, there will be an error in post-processing, trying to go to the # same directory. mostrecentissue = data[0]['IssueDate'] # keep for processing issues arriving out of order # Remove characters we don't want in the filename before (maybe) adding to DESTINATION_DIR # as windows drive identifiers have colon, eg c: but no colons allowed elsewhere? dic = {'<': '', '>': '', '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': ''} mag_name = formatter.latinToAscii(formatter.replace_all(book['BookID'], dic)) # book auxinfo is a cleaned date, eg 2015-01-01 dest_path = lazylibrarian.MAG_DEST_FOLDER.replace( '$IssueDate', book['AuxInfo']).replace('$Title', mag_name) # dest_path = '_Magazines/'+title+'/'+book['AuxInfo'] if lazylibrarian.MAG_RELATIVE: if dest_path[0] not in '._': dest_path = '_' + dest_path dest_path = os.path.join(lazylibrarian.DESTINATION_DIR, dest_path).encode( lazylibrarian.SYS_ENCODING) else: dest_path = dest_path.encode(lazylibrarian.SYS_ENCODING) authorname = None bookname = None global_name = lazylibrarian.MAG_DEST_FILE.replace('$IssueDate', book['AuxInfo']).replace( '$Title', mag_name) global_name = common.remove_accents(global_name) # global_name = book['AuxInfo']+' - '+title else: logger.debug("Snatched magazine %s is not in download directory" % (book['BookID'])) continue else: logger.debug("Snatched %s %s is not in download directory" % (book['NZBmode'], book['NZBtitle'])) continue # try: # os.chmod(dest_path, 0777) # except Exception, e: # logger.debug("Could not chmod post-process directory: " + str(dest_path)) processBook = processDestination(pp_path, dest_path, authorname, bookname, global_name) if processBook: logger.debug("Processing %s, %s" % (global_name, book['NZBurl'])) # update nzbs, only update the snatched ones in case multiple matches for same book / magazine issue controlValueDict = {"NZBurl": book['NZBurl'], "Status": "Snatched"} newValueDict = {"Status": "Processed", "NZBDate": formatter.now()} # say when we processed it myDB.upsert("wanted", newValueDict, controlValueDict) if bookname is not None: # it's a book, if None it's a magazine processExtras(myDB, dest_path, global_name, data) else: # update mags controlValueDict = {"Title": book['BookID']} if mostrecentissue > book['AuxInfo']: # check this in case processing issues arriving out of order newValueDict = {"LastAcquired": formatter.today(), "IssueStatus": "Open"} else: newValueDict = {"IssueDate": book['AuxInfo'], "LastAcquired": formatter.today(), "IssueStatus": "Open"} myDB.upsert("magazines", newValueDict, controlValueDict) # dest_path is where we put the magazine after processing, but we don't have the full filename # so look for any "book" in that directory dest_file = book_file(dest_path, booktype='mag') controlValueDict = {"Title": book['BookID'], "IssueDate": book['AuxInfo']} newValueDict = {"IssueAcquired": formatter.today(), "IssueFile": dest_file, "IssueID": magazinescan.create_id("%s %s" % (book['BookID'], book['AuxInfo'])) } myDB.upsert("issues", newValueDict, controlValueDict) # create a thumbnail cover for the new issue magazinescan.create_cover(dest_file) logger.info('Successfully processed: %s' % global_name) ppcount = ppcount + 1 notifiers.notify_download(formatter.latinToAscii(global_name) + ' at ' + formatter.now()) else: logger.error('Postprocessing for %s has failed.' % global_name) logger.error('Warning - Residual files remain in %s.fail' % pp_path) # at this point, as it failed we should move it or it will get postprocessed # again (and fail again) try: os.rename(pp_path, pp_path + '.fail') except: logger.debug("Unable to rename %s" % pp_path) downloads = os.listdir(processpath) # check in case we processed/deleted some above for directory in downloads: if "LL.(" in directory and not directory.endswith('.fail'): bookID = str(directory).split("LL.(")[1].split(")")[0] logger.debug("Book with id: " + str(bookID) + " is in downloads") pp_path = os.path.join(processpath, directory) if os.path.isfile(pp_path): pp_path = os.path.join(processpath) if (os.path.isdir(pp_path)): logger.debug('Found LL folder %s.' % pp_path) if import_book(pp_path, bookID): ppcount = ppcount + 1 if ppcount: logger.info('%s books/mags have been processed.' % ppcount) else: logger.info('No snatched books/mags have been found') if reset: common.schedule_job(action='Restart', target='processDir')
def find_results(self, authorname=None, queue=None): resultlist = [] api_hits = 0 # Goodreads doesn't like initials followed by spaces, # eg "M L Hamilton", needs "M. L. Hamilton" or "M.L.Hamilton" # but DOES need spaces if not initials eg "Tom.Holt" fails, but "Tom Holt" works if authorname[1] == ' ': authorname = authorname.replace(' ', '.') authorname = authorname.replace('..', '.') url = urllib.quote_plus(authorname.encode(lazylibrarian.SYS_ENCODING)) set_url = 'http://www.goodreads.com/search.xml?q=' + url + '&' + urllib.urlencode(self.params) logger.debug('Now searching GoodReads API with keyword: ' + authorname) logger.debug('Searching for %s at: %s' % (authorname, set_url)) try: try: rootxml, in_cache = get_xml_request(set_url) except Exception as e: logger.error("Error finding results: %s" % e) return if not len(rootxml): logger.debug("Error requesting results") return resultxml = rootxml.getiterator('work') resultcount = 0 for author in resultxml: bookdate = "0001-01-01" if (author.find('original_publication_year').text is None): bookdate = "0000" else: bookdate = author.find('original_publication_year').text authorNameResult = author.find('./best_book/author/name').text booksub = "" bookpub = "" booklang = "Unknown" try: bookimg = author.find('./best_book/image_url').text if (bookimg == 'http://www.goodreads.com/assets/nocover/111x148.png'): bookimg = 'images/nocover.png' except (KeyError, AttributeError): bookimg = 'images/nocover.png' try: bookrate = author.find('average_rating').text except KeyError: bookrate = 0 bookpages = '0' bookgenre = '' bookdesc = '' bookisbn = '' booklink = 'http://www.goodreads.com/book/show/' + author.find('./best_book/id').text if (author.find('./best_book/title').text is None): bookTitle = "" else: bookTitle = author.find('./best_book/title').text author_fuzz = fuzz.token_set_ratio(authorNameResult, authorname) book_fuzz = fuzz.token_set_ratio(bookTitle, authorname) try: isbn_check = int(authorname[:-1]) if (len(str(isbn_check)) == 9) or (len(str(isbn_check)) == 12): isbn_fuzz = int(100) else: isbn_fuzz = int(0) except: isbn_fuzz = int(0) highest_fuzz = max(author_fuzz, book_fuzz, isbn_fuzz) bookid = author.find('./best_book/id').text resultlist.append({ 'authorname': author.find('./best_book/author/name').text, 'bookid': bookid, 'authorid': author.find('./best_book/author/id').text, 'bookname': bookTitle.encode("ascii", "ignore"), 'booksub': None, 'bookisbn': bookisbn, 'bookpub': bookpub, 'bookdate': bookdate, 'booklang': booklang, 'booklink': booklink, 'bookrate': float(bookrate), 'bookimg': bookimg, 'bookpages': bookpages, 'bookgenre': bookgenre, 'bookdesc': bookdesc, 'author_fuzz': author_fuzz, 'book_fuzz': book_fuzz, 'isbn_fuzz': isbn_fuzz, 'highest_fuzz': highest_fuzz, 'num_reviews': float(bookrate) }) resultcount = resultcount + 1 except urllib2.HTTPError as err: if err.code == 404: logger.error('Received a 404 error when searching for author') if err.code == 403: logger.warn('Access to api is denied: usage exceeded') else: logger.error('An unexpected error has occurred when searching for an author') logger.debug('Found %s result%s with keyword: %s' % (resultcount, plural(resultcount), authorname)) logger.debug('The GoodReads API was hit %s time%s for keyword %s' % (api_hits, plural(api_hits), authorname)) queue.put(resultlist)
def find_results(self, searchterm=None, queue=None): """ GoogleBooks performs much better if we search for author OR title not both at once, so if searchterm is not isbn, two searches needed. Lazylibrarian searches use <ll> to separate title from author in searchterm If this token isn't present, it's an isbn or searchterm as supplied by user """ try: myDB = database.DBConnection() resultlist = [] # See if we should check ISBN field, otherwise ignore it api_strings = ['inauthor:', 'intitle:'] if is_valid_isbn(searchterm): api_strings = ['isbn:'] api_hits = 0 ignored = 0 total_count = 0 no_author_count = 0 title = '' authorname = '' if ' <ll> ' in searchterm: # special token separates title from author title, authorname = searchterm.split(' <ll> ') fullterm = searchterm.replace(' <ll> ', ' ') logger.debug('Now searching Google Books API with searchterm: %s' % fullterm) for api_value in api_strings: set_url = self.url if api_value == "isbn:": set_url = set_url + quote(api_value + searchterm) elif api_value == 'intitle:': searchterm = fullterm if title: # just search for title # noinspection PyUnresolvedReferences title = title.split(' (')[0] # without any series info searchterm = title searchterm = searchterm.replace("'", "").replace('"', '').strip() # and no quotes if PY2: searchterm = searchterm.encode(lazylibrarian.SYS_ENCODING) set_url = set_url + quote(api_value + '"' + searchterm + '"') elif api_value == 'inauthor:': searchterm = fullterm if authorname: searchterm = authorname # just search for author searchterm = searchterm.strip() if PY2: searchterm = searchterm.encode(lazylibrarian.SYS_ENCODING) set_url = set_url + quote_plus(api_value + '"' + searchterm + '"') startindex = 0 resultcount = 0 ignored = 0 number_results = 1 total_count = 0 no_author_count = 0 try: while startindex < number_results: self.params['startIndex'] = startindex URL = set_url + '&' + urlencode(self.params) try: jsonresults, in_cache = gb_json_request(URL) if jsonresults is None: number_results = 0 else: if not in_cache: api_hits += 1 number_results = jsonresults['totalItems'] logger.debug('Searching url: ' + URL) if number_results == 0: logger.warn('Found no results for %s with value: %s' % (api_value, searchterm)) break else: pass except Exception as err: if hasattr(err, 'reason'): errmsg = err.reason else: errmsg = str(err) logger.warn( 'Google Books API Error [%s]: Check your API key or wait a while' % errmsg) break startindex += 40 for item in jsonresults['items']: total_count += 1 book = bookdict(item) if not book['author']: logger.debug('Skipped a result without authorfield.') no_author_count += 1 continue if not book['name']: logger.debug('Skipped a result without title.') continue valid_langs = getList(lazylibrarian.CONFIG['IMP_PREFLANG']) if "All" not in valid_langs: # don't care about languages, accept all try: # skip if language is not in valid list - booklang = book['lang'] if booklang not in valid_langs: logger.debug( 'Skipped %s with language %s' % (book['name'], booklang)) ignored += 1 continue except KeyError: ignored += 1 logger.debug('Skipped %s where no language is found' % book['name']) continue if authorname: author_fuzz = fuzz.ratio(book['author'], authorname) else: author_fuzz = fuzz.ratio(book['author'], fullterm) if title: book_fuzz = fuzz.token_set_ratio(book['name'], title) # lose a point for each extra word in the fuzzy matches so we get the closest match words = len(getList(book['name'])) words -= len(getList(title)) book_fuzz -= abs(words) else: book_fuzz = fuzz.token_set_ratio(book['name'], fullterm) isbn_fuzz = 0 if is_valid_isbn(fullterm): isbn_fuzz = 100 highest_fuzz = max((author_fuzz + book_fuzz) / 2, isbn_fuzz) dic = {':': '.', '"': '', '\'': ''} bookname = replace_all(book['name'], dic) bookname = unaccented(bookname) bookname = bookname.strip() # strip whitespace AuthorID = '' if book['author']: match = myDB.match( 'SELECT AuthorID FROM authors WHERE AuthorName=?', ( book['author'].replace('"', '""'),)) if match: AuthorID = match['AuthorID'] resultlist.append({ 'authorname': book['author'], 'authorid': AuthorID, 'bookid': item['id'], 'bookname': bookname, 'booksub': book['sub'], 'bookisbn': book['isbn'], 'bookpub': book['pub'], 'bookdate': book['date'], 'booklang': book['lang'], 'booklink': book['link'], 'bookrate': float(book['rate']), 'bookrate_count': book['rate_count'], 'bookimg': book['img'], 'bookpages': book['pages'], 'bookgenre': book['genre'], 'bookdesc': book['desc'], 'author_fuzz': author_fuzz, 'book_fuzz': book_fuzz, 'isbn_fuzz': isbn_fuzz, 'highest_fuzz': highest_fuzz, 'num_reviews': book['ratings'] }) resultcount += 1 except KeyError: break logger.debug("Returning %s result%s for (%s) with keyword: %s" % (resultcount, plural(resultcount), api_value, searchterm)) logger.debug("Found %s result%s" % (total_count, plural(total_count))) logger.debug("Removed %s unwanted language result%s" % (ignored, plural(ignored))) logger.debug("Removed %s book%s with no author" % (no_author_count, plural(no_author_count))) logger.debug('The Google Books API was hit %s time%s for searchterm: %s' % (api_hits, plural(api_hits), fullterm)) queue.put(resultlist) except Exception: logger.error('Unhandled exception in GB.find_results: %s' % traceback.format_exc())
def find_results(self, authorname=None, queue=None): try: myDB = database.DBConnection() resultlist = [] # See if we should check ISBN field, otherwise ignore it api_strings = ['inauthor:', 'intitle:'] if is_valid_isbn(authorname): api_strings = ['isbn:'] api_hits = 0 logger.debug( 'Now searching Google Books API with keyword: ' + self.name) for api_value in api_strings: startindex = 0 if api_value == "isbn:": set_url = self.url + urllib.quote(api_value + self.name.encode(lazylibrarian.SYS_ENCODING)) else: set_url = self.url + \ urllib.quote(api_value + '"' + self.name.encode(lazylibrarian.SYS_ENCODING) + '"') try: startindex = 0 resultcount = 0 ignored = 0 number_results = 1 total_count = 0 no_author_count = 0 while startindex < number_results: self.params['startIndex'] = startindex URL = set_url + '&' + urllib.urlencode(self.params) try: jsonresults, in_cache = get_json_request(URL) if jsonresults is None: number_results = 0 else: if not in_cache: api_hits = api_hits + 1 number_results = jsonresults['totalItems'] logger.debug('Searching url: ' + URL) if number_results == 0: logger.warn( 'Found no results for %s with value: %s' % (api_value, self.name)) break else: pass except HTTPError as err: logger.warn( 'Google Books API Error [%s]: Check your API key or wait a while' % err.reason) break startindex = startindex + 40 for item in jsonresults['items']: total_count = total_count + 1 # skip if no author, no author is no book. try: Author = item['volumeInfo']['authors'][0] except KeyError: logger.debug('Skipped a result without authorfield.') no_author_count = no_author_count + 1 continue try: bookname = item['volumeInfo']['title'] except KeyError: logger.debug('Skipped a result without title.') continue valid_langs = ([valid_lang.strip() for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')]) if "All" not in valid_langs: # don't care about languages, accept all try: # skip if language is not in valid list - booklang = item['volumeInfo']['language'] if booklang not in valid_langs: logger.debug( 'Skipped %s with language %s' % (bookname, booklang)) ignored = ignored + 1 continue except KeyError: ignored = ignored + 1 logger.debug('Skipped %s where no language is found', bookname) continue try: bookpub = item['volumeInfo']['publisher'] except KeyError: bookpub = None try: booksub = item['volumeInfo']['subtitle'] except KeyError: booksub = None try: bookdate = item['volumeInfo']['publishedDate'] except KeyError: bookdate = '0000-00-00' bookdate = bookdate[:4] try: bookimg = item['volumeInfo']['imageLinks']['thumbnail'] except KeyError: bookimg = 'images/nocover.png' try: bookrate = item['volumeInfo']['averageRating'] except KeyError: bookrate = 0 try: bookpages = item['volumeInfo']['pageCount'] except KeyError: bookpages = '0' try: bookgenre = item['volumeInfo']['categories'][0] except KeyError: bookgenre = None try: bookdesc = item['volumeInfo']['description'] except KeyError: bookdesc = 'Not available' try: num_reviews = item['volumeInfo']['ratingsCount'] except KeyError: num_reviews = 0 try: if item['volumeInfo']['industryIdentifiers'][0]['type'] == 'ISBN_10': bookisbn = item['volumeInfo'][ 'industryIdentifiers'][0]['identifier'] else: bookisbn = 0 except KeyError: bookisbn = 0 author_fuzz = fuzz.token_set_ratio(Author, authorname) book_fuzz = fuzz.token_set_ratio(bookname, authorname) isbn_fuzz = 0 if is_valid_isbn(authorname): isbn_fuzz = 100 highest_fuzz = max(author_fuzz, book_fuzz, isbn_fuzz) dic = {':': '', '"': '', '\'': ''} bookname = replace_all(bookname, dic) bookname = unaccented(bookname) bookname = bookname.strip() # strip whitespace bookid = item['id'] author = myDB.select( 'SELECT AuthorID FROM authors WHERE AuthorName = "%s"' % Author.replace('"', '""')) if author: AuthorID = author[0]['authorid'] else: AuthorID = '' resultlist.append({ 'authorname': Author, 'authorid': AuthorID, 'bookid': bookid, 'bookname': bookname, 'booksub': booksub, 'bookisbn': bookisbn, 'bookpub': bookpub, 'bookdate': bookdate, 'booklang': booklang, 'booklink': item['volumeInfo']['canonicalVolumeLink'], 'bookrate': float(bookrate), 'bookimg': bookimg, 'bookpages': bookpages, 'bookgenre': bookgenre, 'bookdesc': bookdesc, 'author_fuzz': author_fuzz, 'book_fuzz': book_fuzz, 'isbn_fuzz': isbn_fuzz, 'highest_fuzz': highest_fuzz, 'num_reviews': num_reviews }) resultcount = resultcount + 1 except KeyError: break logger.debug("Found %s total result%s" % (total_count, plural(total_count))) logger.debug("Removed %s unwanted language result%s" % (ignored, plural(ignored))) logger.debug("Removed %s book%s with no author" % (no_author_count, plural(no_author_count))) logger.debug("Showing %s result%s for (%s) with keyword: %s" % (resultcount, plural(resultcount), api_value, authorname)) logger.debug( 'The Google Books API was hit %s time%s for keyword %s' % (api_hits, plural(api_hits), self.name)) queue.put(resultlist) except Exception as e: logger.error('Unhandled exception in GB.find_results: %s' % traceback.format_exc())
def getSeriesAuthors(seriesid): """ Get a list of authors contributing to a series and import those authors (and their books) into the database Return how many authors you added """ myDB = database.DBConnection() result = myDB.match("select count(*) as counter from authors") start = int(result['counter']) result = myDB.match('select SeriesName from series where SeriesID=?', (seriesid,)) seriesname = result['SeriesName'] members = getSeriesMembers(seriesid) dic = {u'\u2018': "", u'\u2019': "", u'\u201c': '', u'\u201d': '', "'": "", '"': ''} if members: myDB = database.DBConnection() for member in members: # order = member[0] bookname = member[1] authorname = member[2] # workid = member[3] authorid = member[4] bookname = replace_all(bookname, dic) if not authorid: # goodreads gives us all the info we need, librarything/google doesn't base_url = 'https://www.goodreads.com/search.xml?q=' params = {"key": lazylibrarian.CONFIG['GR_API']} searchname = bookname + ' ' + authorname searchname = cleanName(unaccented(searchname)) if PY2: searchname = searchname.encode(lazylibrarian.SYS_ENCODING) searchterm = quote_plus(searchname) set_url = base_url + searchterm + '&' + urlencode(params) try: rootxml, in_cache = gr_xml_request(set_url) if rootxml is None: logger.warn('Error getting XML for %s' % searchname) else: resultxml = rootxml.getiterator('work') for item in resultxml: try: booktitle = item.find('./best_book/title').text booktitle = replace_all(booktitle, dic) except (KeyError, AttributeError): booktitle = "" book_fuzz = fuzz.token_set_ratio(booktitle, bookname) if book_fuzz >= 98: try: author = item.find('./best_book/author/name').text except (KeyError, AttributeError): author = "" # try: # workid = item.find('./work/id').text # except (KeyError, AttributeError): # workid = "" try: authorid = item.find('./best_book/author/id').text except (KeyError, AttributeError): authorid = "" logger.debug("Author Search found %s %s, authorid %s" % (author, booktitle, authorid)) break if not authorid: # try again with title only searchname = cleanName(unaccented(bookname)) if PY2: searchname = searchname.encode(lazylibrarian.SYS_ENCODING) searchterm = quote_plus(searchname) set_url = base_url + searchterm + '&' + urlencode(params) rootxml, in_cache = gr_xml_request(set_url) if rootxml is None: logger.warn('Error getting XML for %s' % searchname) else: resultxml = rootxml.getiterator('work') for item in resultxml: booktitle = item.find('./best_book/title').text booktitle = replace_all(booktitle, dic) book_fuzz = fuzz.token_set_ratio(booktitle, bookname) if book_fuzz >= 98: try: author = item.find('./best_book/author/name').text except (KeyError, AttributeError): author = "" # try: # workid = item.find('./work/id').text # except (KeyError, AttributeError): # workid = "" try: authorid = item.find('./best_book/author/id').text except (KeyError, AttributeError): authorid = "" logger.debug("Title Search found %s %s, authorid %s" % (author, booktitle, authorid)) break if not authorid: logger.warn("GoodReads doesn't know about %s %s" % (authorname, bookname)) except Exception as e: logger.error("Error finding goodreads results: %s %s" % (type(e).__name__, str(e))) if authorid: lazylibrarian.importer.addAuthorToDB(refresh=False, authorid=authorid) result = myDB.match("select count(*) as counter from authors") finish = int(result['counter']) newauth = finish - start logger.info("Added %s new author%s for %s" % (newauth, plural(newauth), seriesname)) return newauth
def find_results(self, authorname=None, queue=None): threading.currentThread().name = "GR-SEARCH" resultlist = [] api_hits = 0 # Goodreads doesn't like initials followed by spaces, # eg "M L Hamilton", needs "M. L. Hamilton" or "M.L.Hamilton" # but DOES need spaces if not initials eg "Tom.Holt" fails, but "Tom Holt" works if authorname[1] == ' ': authorname = authorname.replace(' ', '.') authorname = authorname.replace('..', '.') url = urllib.quote_plus(authorname.encode('utf-8')) set_url = 'http://www.goodreads.com/search.xml?q=' + url + '&' + urllib.urlencode(self.params) logger.debug('Now searching GoodReads API with keyword: ' + authorname) logger.debug('Searching for %s at: %s' % (authorname, set_url)) try: try: rootxml, in_cache = self.get_request(set_url) except Exception, e: logger.error("Error finding results: " + str(e)) return resultxml = rootxml.getiterator('work') resultcount = 0 for author in resultxml: bookdate = "0001-01-01" if (author.find('original_publication_year').text is None): bookdate = "0000" else: bookdate = author.find('original_publication_year').text authorNameResult = author.find('./best_book/author/name').text booksub = "" bookpub = "" booklang = "Unknown" try: bookimg = author.find('./best_book/image_url').text if (bookimg == 'http://www.goodreads.com/assets/nocover/111x148.png'): bookimg = 'images/nocover.png' except KeyError: bookimg = 'images/nocover.png' except AttributeError: bookimg = 'images/nocover.png' try: bookrate = author.find('average_rating').text except KeyError: bookrate = 0 bookpages = '0' bookgenre = '' bookdesc = '' bookisbn = '' booklink = 'http://www.goodreads.com/book/show/' + author.find('./best_book/id').text if (author.find('./best_book/title').text is None): bookTitle = "" else: bookTitle = author.find('./best_book/title').text author_fuzz = fuzz.token_set_ratio(authorNameResult, authorname) book_fuzz = fuzz.token_set_ratio(bookTitle, authorname) try: isbn_check = int(authorname[:-1]) if (len(str(isbn_check)) == 9) or (len(str(isbn_check)) == 12): isbn_fuzz = int(100) else: isbn_fuzz = int(0) except: isbn_fuzz = int(0) highest_fuzz = max(author_fuzz, book_fuzz, isbn_fuzz) resultlist.append({ 'authorname': author.find('./best_book/author/name').text, 'bookid': author.find('./best_book/id').text, 'authorid': author.find('./best_book/author/id').text, 'bookname': bookTitle.encode("ascii", "ignore"), 'booksub': booksub, 'bookisbn': bookisbn, 'bookpub': bookpub, 'bookdate': bookdate, 'booklang': booklang, 'booklink': booklink, 'bookrate': float(bookrate), 'bookimg': bookimg, 'bookpages': bookpages, 'bookgenre': bookgenre, 'bookdesc': bookdesc, 'author_fuzz': author_fuzz, 'book_fuzz': book_fuzz, 'isbn_fuzz': isbn_fuzz, 'highest_fuzz': highest_fuzz, 'num_reviews': float(bookrate) }) resultcount = resultcount + 1
def findBestResult(resultlist, book, searchtype, source): """ resultlist: collated results from search providers book: the book we want to find searchtype: book, magazine, shortbook, audiobook etc. source: nzb, tor, rss, direct return: highest scoring match, or None if no match """ # noinspection PyBroadException try: myDB = database.DBConnection() dictrepl = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': ' ', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '', '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '', '!': '', '-': ' ', '\s\s': ' '} dic = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '.', ';': '', '\'': ''} if source == 'rss': author, title = get_searchterm(book, searchtype) else: author = unaccented_str(replace_all(book['authorName'], dic)) title = unaccented_str(replace_all(book['bookName'], dic)) if book['library'] == 'AudioBook': reject_list = getList(lazylibrarian.CONFIG['REJECT_AUDIO']) maxsize = check_int(lazylibrarian.CONFIG['REJECT_MAXAUDIO'], 0) minsize = check_int(lazylibrarian.CONFIG['REJECT_MINAUDIO'], 0) auxinfo = 'AudioBook' else: # elif book['library'] == 'eBook': reject_list = getList(lazylibrarian.CONFIG['REJECT_WORDS']) maxsize = check_int(lazylibrarian.CONFIG['REJECT_MAXSIZE'], 0) minsize = check_int(lazylibrarian.CONFIG['REJECT_MINSIZE'], 0) auxinfo = 'eBook' if source == 'nzb': prefix = 'nzb' else: # rss and libgen return same names as torrents prefix = 'tor_' logger.debug('Searching %s %s results for best %s match' % (len(resultlist), source, auxinfo)) matches = [] for res in resultlist: resultTitle = unaccented_str(replace_all(res[prefix + 'title'], dictrepl)).strip() resultTitle = re.sub(r"\s\s+", " ", resultTitle) # remove extra whitespace Author_match = fuzz.token_set_ratio(author, resultTitle) Book_match = fuzz.token_set_ratio(title, resultTitle) if lazylibrarian.LOGLEVEL & lazylibrarian.log_fuzz: logger.debug("%s author/book Match: %s/%s %s at %s" % (source.upper(), Author_match, Book_match, resultTitle, res[prefix + 'prov'])) rejected = False url = res[prefix + 'url'] if url is None: rejected = True logger.debug("Rejecting %s, no URL found" % resultTitle) if not rejected and lazylibrarian.CONFIG['BLACKLIST_FAILED']: already_failed = myDB.match('SELECT * from wanted WHERE NZBurl=? and Status="Failed"', (url,)) if already_failed: logger.debug("Rejecting %s, blacklisted at %s" % (resultTitle, already_failed['NZBprov'])) rejected = True if not rejected and lazylibrarian.CONFIG['BLACKLIST_PROCESSED']: already_failed = myDB.match('SELECT * from wanted WHERE NZBurl=?', (url,)) if already_failed: logger.debug("Rejecting %s, blacklisted at %s" % (resultTitle, already_failed['NZBprov'])) rejected = True if not rejected and not url.startswith('http') and not url.startswith('magnet'): rejected = True logger.debug("Rejecting %s, invalid URL [%s]" % (resultTitle, url)) if not rejected: for word in reject_list: if word in getList(resultTitle.lower()) and word not in getList(author.lower()) \ and word not in getList(title.lower()): rejected = True logger.debug("Rejecting %s, contains %s" % (resultTitle, word)) break size_temp = check_int(res[prefix + 'size'], 1000) # Need to cater for when this is NONE (Issue 35) size = round(float(size_temp) / 1048576, 2) if not rejected and maxsize and size > maxsize: rejected = True logger.debug("Rejecting %s, too large" % resultTitle) if not rejected and minsize and size < minsize: rejected = True logger.debug("Rejecting %s, too small" % resultTitle) if not rejected: bookid = book['bookid'] # newTitle = (author + ' - ' + title + ' LL.(' + book['bookid'] + ')').strip() # newTitle = resultTitle + ' LL.(' + book['bookid'] + ')' if source == 'nzb': mode = res['nzbmode'] # nzb, torznab else: mode = res['tor_type'] # torrent, magnet, nzb(from rss), direct controlValueDict = {"NZBurl": url} newValueDict = { "NZBprov": res[prefix + 'prov'], "BookID": bookid, "NZBdate": now(), # when we asked for it "NZBsize": size, "NZBtitle": resultTitle, "NZBmode": mode, "AuxInfo": auxinfo, "Status": "Skipped" } score = (Book_match + Author_match) / 2 # as a percentage # lose a point for each unwanted word in the title so we get the closest match # but for RSS ignore anything at the end in square braces [keywords, genres etc] if source == 'rss': wordlist = getList(resultTitle.rsplit('[', 1)[0].lower()) else: wordlist = getList(resultTitle.lower()) words = [x for x in wordlist if x not in getList(author.lower())] words = [x for x in words if x not in getList(title.lower())] typelist = '' if newValueDict['AuxInfo'] == 'eBook': words = [x for x in words if x not in getList(lazylibrarian.CONFIG['EBOOK_TYPE'])] typelist = getList(lazylibrarian.CONFIG['EBOOK_TYPE']) elif newValueDict['AuxInfo'] == 'AudioBook': words = [x for x in words if x not in getList(lazylibrarian.CONFIG['AUDIOBOOK_TYPE'])] typelist = getList(lazylibrarian.CONFIG['AUDIOBOOK_TYPE']) score -= len(words) # prioritise titles that include the ebook types we want # add more points for booktypes nearer the left in the list # eg if epub, mobi, pdf add 3 points if epub found, 2 for mobi, 1 for pdf booktypes = [x for x in wordlist if x in typelist] if booktypes: typelist = list(reversed(typelist)) for item in booktypes: for i in [i for i, x in enumerate(typelist) if x == item]: score += i + 1 matches.append([score, newValueDict, controlValueDict, res['priority']]) if matches: highest = max(matches, key=lambda s: (s[0], s[3])) score = highest[0] newValueDict = highest[1] # controlValueDict = highest[2] dlpriority = highest[3] if score < int(lazylibrarian.CONFIG['MATCH_RATIO']): logger.info('Nearest match (%s%%): %s using %s search for %s %s' % (score, newValueDict['NZBtitle'], searchtype, book['authorName'], book['bookName'])) else: logger.info('Best match (%s%%): %s using %s search, %s priority %s' % (score, newValueDict['NZBtitle'], searchtype, newValueDict['NZBprov'], dlpriority)) return highest else: logger.debug("No %s found for [%s] using searchtype %s" % (source, book["searchterm"], searchtype)) return None except Exception: logger.error('Unhandled exception in findBestResult: %s' % traceback.format_exc())
def searchItem(item=None, bookid=None, cat=None): """ Call all active search providers to search for item return a list of results, each entry in list containing percentage_match, title, provider, size, url item = searchterm to use for general search bookid = link to data for book/audio searches cat = category to search [general, book, audio] """ results = [] if not item: return results book = {} searchterm = unaccented_str(item) book['searchterm'] = searchterm if bookid: book['bookid'] = bookid else: book['bookid'] = searchterm if cat in ['book', 'audio']: myDB = database.DBConnection() cmd = 'SELECT authorName,bookName,bookSub from books,authors WHERE books.AuthorID=authors.AuthorID' cmd += ' and bookID=?' match = myDB.match(cmd, (bookid,)) if match: book['authorName'] = match['authorName'] book['bookName'] = match['bookName'] book['bookSub'] = match['bookSub'] else: logger.debug('Forcing general search') cat = 'general' nprov = lazylibrarian.USE_NZB() + lazylibrarian.USE_TOR() + lazylibrarian.USE_RSS() + lazylibrarian.USE_DIRECT() logger.debug('Searching %s provider%s (%s) for %s' % (nprov, plural(nprov), cat, searchterm)) if lazylibrarian.USE_NZB(): resultlist, nprov = IterateOverNewzNabSites(book, cat) if nprov: results += resultlist if lazylibrarian.USE_TOR(): resultlist, nprov = IterateOverTorrentSites(book, cat) if nprov: results += resultlist if lazylibrarian.USE_DIRECT(): resultlist, nprov = IterateOverDirectSites(book, cat) if nprov: results += resultlist if lazylibrarian.USE_RSS(): resultlist, nprov = IterateOverRSSSites() if nprov: results += resultlist # reprocess to get consistent results searchresults = [] for item in results: provider = '' title = '' url = '' size = '' date = '' mode = '' if 'nzbtitle' in item: title = item['nzbtitle'] if 'nzburl' in item: url = item['nzburl'] if 'nzbprov' in item: provider = item['nzbprov'] if 'nzbsize' in item: size = item['nzbsize'] if 'nzbdate' in item: date = item['nzbdate'] if 'nzbmode' in item: mode = item['nzbmode'] if 'tor_title' in item: title = item['tor_title'] if 'tor_url' in item: url = item['tor_url'] if 'tor_prov' in item: provider = item['tor_prov'] if 'tor_size' in item: size = item['tor_size'] if 'tor_date' in item: date = item['tor_date'] if 'tor_type' in item: mode = item['tor_type'] if title and provider and mode and url: # Not all results have a date or a size if not date: date = 'Fri, 01 Jan 1970 00:00:00 +0100' if not size: size = '1000' # calculate match percentage - torrents might have words_with_underscore_separator score = fuzz.token_set_ratio(searchterm, title.replace('_', ' ')) # lose a point for each extra word in the title so we get the closest match words = len(getList(searchterm)) words -= len(getList(title)) score -= abs(words) if score >= 40: # ignore wildly wrong results? result = {'score': score, 'title': title, 'provider': provider, 'size': size, 'date': date, 'url': quote_plus(url), 'mode': mode} searchresults.append(result) # from operator import itemgetter # searchresults = sorted(searchresults, key=itemgetter('score'), reverse=True) logger.debug('Found %s %s results for %s' % (len(searchresults), cat, searchterm)) return searchresults
def processResultList(resultlist, book, searchtype): myDB = database.DBConnection() dictrepl = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': ' ', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '', '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '', '!': '', '-': ' ', '\s\s': ' '} dic = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '.', ';': ''} match_ratio = int(lazylibrarian.CONFIG['MATCH_RATIO']) reject_list = getList(lazylibrarian.CONFIG['REJECT_WORDS']) author = unaccented_str(replace_all(book['authorName'], dic)) title = unaccented_str(replace_all(book['bookName'], dic)) matches = [] for tor in resultlist: torTitle = unaccented_str(tor['tor_title']) torTitle = replace_all(torTitle, dictrepl).strip() torTitle = re.sub(r"\s\s+", " ", torTitle) # remove extra whitespace torAuthor_match = fuzz.token_set_ratio(author, torTitle) torBook_match = fuzz.token_set_ratio(title, torTitle) logger.debug(u"TOR author/book Match: %s/%s for %s" % (torAuthor_match, torBook_match, torTitle)) tor_url = tor['tor_url'] rejected = False already_failed = myDB.match('SELECT * from wanted WHERE NZBurl="%s" and Status="Failed"' % tor_url) if already_failed: logger.debug("Rejecting %s, blacklisted at %s" % (torTitle, already_failed['NZBprov'])) rejected = True if not rejected: for word in reject_list: if word in torTitle.lower() and word not in author.lower() and word not in title.lower(): rejected = True logger.debug("Rejecting %s, contains %s" % (torTitle, word)) break tor_size_temp = tor['tor_size'] # Need to cater for when this is NONE (Issue 35) tor_size_temp = check_int(tor_size_temp, 1000) tor_size = round(float(tor_size_temp) / 1048576, 2) maxsize = check_int(lazylibrarian.CONFIG['REJECT_MAXSIZE'], 0) if not rejected: if maxsize and tor_size > maxsize: rejected = True logger.debug("Rejecting %s, too large" % torTitle) minsize = check_int(lazylibrarian.CONFIG['REJECT_MINSIZE'], 0) if not rejected: if minsize and tor_size < minsize: rejected = True logger.debug("Rejecting %s, too small" % torTitle) if not rejected: bookid = book['bookid'] tor_Title = (author + ' - ' + title + ' LL.(' + book['bookid'] + ')').strip() controlValueDict = {"NZBurl": tor_url} newValueDict = { "NZBprov": tor['tor_prov'], "BookID": bookid, "NZBdate": now(), # when we asked for it "NZBsize": tor_size, "NZBtitle": tor_Title, "NZBmode": "torrent", "Status": "Skipped" } score = (torBook_match + torAuthor_match) / 2 # as a percentage # lose a point for each extra word in the title so we get the closest match words = len(getList(torTitle)) words -= len(getList(author)) words -= len(getList(title)) score -= abs(words) matches.append([score, torTitle, newValueDict, controlValueDict]) if matches: highest = max(matches, key=lambda x: x[0]) score = highest[0] nzb_Title = highest[1] newValueDict = highest[2] controlValueDict = highest[3] if score < match_ratio: logger.info(u'Nearest TOR match (%s%%): %s using %s search for %s %s' % (score, nzb_Title, searchtype, author, title)) return False logger.info(u'Best TOR match (%s%%): %s using %s search' % (score, nzb_Title, searchtype)) snatchedbooks = myDB.match('SELECT BookID from books WHERE BookID="%s" and Status="Snatched"' % newValueDict["BookID"]) if snatchedbooks: logger.debug('%s already marked snatched' % nzb_Title) return True # someone else found it, not us else: myDB.upsert("wanted", newValueDict, controlValueDict) if newValueDict["NZBprov"] == 'libgen': # for libgen we use direct download links snatch = DirectDownloadMethod(newValueDict["BookID"], newValueDict["NZBtitle"], controlValueDict["NZBurl"], nzb_Title) else: snatch = TORDownloadMethod(newValueDict["BookID"], newValueDict["NZBtitle"], controlValueDict["NZBurl"]) if snatch: logger.info('Downloading %s from %s' % (newValueDict["NZBtitle"], newValueDict["NZBprov"])) notify_snatch("%s from %s at %s" % (newValueDict["NZBtitle"], newValueDict["NZBprov"], now())) custom_notify_snatch(newValueDict["BookID"]) scheduleJob(action='Start', target='processDir') return True + True # we found it else: logger.debug("No torrent's found for [%s] using searchtype %s" % (book["searchterm"], searchtype)) return False
def getSeriesAuthors(seriesid): """ Get a list of authors contributing to a series and import those authors (and their books) into the database Return how many authors you added """ myDB = database.DBConnection() result = myDB.match("select count(*) as counter from authors") start = int(result['counter']) result = myDB.match('select SeriesName from series where SeriesID=?', (seriesid, )) seriesname = result['SeriesName'] members, api_hits = getSeriesMembers(seriesid, seriesname) dic = { u'\u2018': "", u'\u2019': "", u'\u201c': '', u'\u201d': '', "'": "", '"': '' } if members: myDB = database.DBConnection() for member in members: # order = member[0] bookname = member[1] authorname = member[2] # workid = member[3] authorid = member[4] # pubyear = member[5] bookname = replace_all(bookname, dic) if not authorid: # goodreads gives us all the info we need, librarything/google doesn't base_url = 'https://www.goodreads.com/search.xml?q=' params = {"key": lazylibrarian.CONFIG['GR_API']} searchname = bookname + ' ' + authorname searchname = cleanName(unaccented(searchname)) if PY2: searchname = searchname.encode(lazylibrarian.SYS_ENCODING) searchterm = quote_plus(searchname) set_url = base_url + searchterm + '&' + urlencode(params) try: rootxml, in_cache = gr_xml_request(set_url) if not in_cache: api_hits += 1 if rootxml is None: logger.warn('Error getting XML for %s' % searchname) else: resultxml = rootxml.getiterator('work') for item in resultxml: try: booktitle = item.find('./best_book/title').text booktitle = replace_all(booktitle, dic) except (KeyError, AttributeError): booktitle = "" book_fuzz = fuzz.token_set_ratio( booktitle, bookname) if book_fuzz >= 98: try: author = item.find( './best_book/author/name').text except (KeyError, AttributeError): author = "" # try: # workid = item.find('./work/id').text # except (KeyError, AttributeError): # workid = "" try: authorid = item.find( './best_book/author/id').text except (KeyError, AttributeError): authorid = "" logger.debug( "Author Search found %s %s, authorid %s" % (author, booktitle, authorid)) break if not authorid: # try again with title only searchname = cleanName(unaccented(bookname)) if PY2: searchname = searchname.encode( lazylibrarian.SYS_ENCODING) searchterm = quote_plus(searchname) set_url = base_url + searchterm + '&' + urlencode( params) rootxml, in_cache = gr_xml_request(set_url) if not in_cache: api_hits += 1 if rootxml is None: logger.warn('Error getting XML for %s' % searchname) else: resultxml = rootxml.getiterator('work') for item in resultxml: booktitle = item.find('./best_book/title').text booktitle = replace_all(booktitle, dic) book_fuzz = fuzz.token_set_ratio( booktitle, bookname) if book_fuzz >= 98: try: author = item.find( './best_book/author/name').text except (KeyError, AttributeError): author = "" # try: # workid = item.find('./work/id').text # except (KeyError, AttributeError): # workid = "" try: authorid = item.find( './best_book/author/id').text except (KeyError, AttributeError): authorid = "" logger.debug( "Title Search found %s %s, authorid %s" % (author, booktitle, authorid)) break if not authorid: logger.warn("GoodReads doesn't know about %s %s" % (authorname, bookname)) except Exception as e: logger.error("Error finding goodreads results: %s %s" % (type(e).__name__, str(e))) if authorid: lazylibrarian.importer.addAuthorToDB(refresh=False, authorid=authorid) result = myDB.match("select count(*) as counter from authors") finish = int(result['counter']) newauth = finish - start logger.info("Added %s new author%s for %s" % (newauth, plural(newauth), seriesname)) return newauth
def search_magazines(mags=None, reset=False): # produce a list of magazines to search for, tor, nzb, torznab, rss threadname = threading.currentThread().name if "Thread-" in threadname: threading.currentThread().name = "SEARCHMAG" myDB = database.DBConnection() searchlist = [] if mags is None: # backlog search searchmags = myDB.select('SELECT Title, LastAcquired, \ IssueDate from magazines WHERE Status="Active"' ) else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select( 'SELECT Title, LastAcquired, IssueDate from magazines \ WHERE Title="%s" AND Status="Active"' % (magazine['bookid'])) for terms in searchmags_temp: searchmags.append(terms) if len(searchmags) == 0: return # should clear old search results as might not be available any more # ie torrent not available, changed providers, out of news server retention etc. # Only delete the "skipped" ones, not wanted/snatched/processed/ignored logger.debug(u"Removing old magazine search results") myDB.action('DELETE from pastissues WHERE Status="Skipped"') logger.info('Searching for %i magazine%s' % (len(searchmags), plural(len(searchmags)))) for searchmag in searchmags: bookid = searchmag['Title'] searchterm = searchmag['Title'] # frequency = searchmag[1] # last_acquired = searchmag[2] # issue_date = searchmag[3] dic = { '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '' } searchterm = unaccented_str(replace_all(searchterm, dic)) searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode(lazylibrarian.SYS_ENCODING) searchlist.append({"bookid": bookid, "searchterm": searchterm}) if searchlist == []: logger.warn( 'There is nothing to search for. Mark some magazines as active.') for book in searchlist: resultlist = [] tor_resultlist = [] if lazylibrarian.USE_NZB(): resultlist, nproviders = IterateOverNewzNabSites(book, 'mag') if not nproviders: logger.warn( 'No nzb providers are set. Check config for NEWZNAB or TORZNAB providers' ) if lazylibrarian.USE_TOR(): tor_resultlist, nproviders = IterateOverTorrentSites(book, 'mag') if not nproviders: logger.warn( 'No torrent providers are set. Check config for TORRENT providers' ) if tor_resultlist: for item in tor_resultlist: # reformat the torrent results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned from torrents 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if lazylibrarian.USE_RSS(): rss_resultlist, nproviders = IterateOverRSSSites(book, 'mag') if not nproviders: logger.warn( 'No rss providers are set. Check config for RSS providers') if rss_resultlist: for item in rss_resultlist: # reformat the rss results so they look like nzbs resultlist.append({ 'bookid': book['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': item[ 'tor_date'], # may be fake date as none returned from rss torrents, only rss nzb 'nzbsize': item['tor_size'], 'nzbmode': item['tor_type'] }) if not resultlist: logger.debug("Adding magazine %s to queue." % book['searchterm']) else: bad_regex = 0 bad_date = 0 old_date = 0 total_nzbs = 0 new_date = 0 maglist = [] issues = [] for nzb in resultlist: total_nzbs = total_nzbs + 1 bookid = nzb['bookid'] nzbtitle = unaccented_str(nzb['nzbtitle']) nzbtitle = nzbtitle.replace('"', '').replace( "'", "") # suppress " in titles nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] if nzbsize_temp is None: # not all torrents returned by torznab have a size nzbsize_temp = 1000 nzbsize = round(float(nzbsize_temp) / 1048576, 2) nzbdate = nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] results = myDB.match( 'SELECT * from magazines WHERE Title="%s"' % bookid) if not results: logger.debug( 'Magazine [%s] does not match search term [%s].' % (nzbtitle, bookid)) bad_regex = bad_regex + 1 else: control_date = results['IssueDate'] reject_list = getList(results['Regex']) dic = { '.': ' ', '-': ' ', '/': ' ', '+': ' ', '_': ' ', '(': '', ')': '' } nzbtitle_formatted = replace_all(nzbtitle, dic).strip() # Need to make sure that substrings of magazine titles don't get found # (e.g. Maxim USA will find Maximum PC USA) - token_set_ratio takes care of this # remove extra spaces if they're in a row nzbtitle_exploded_temp = " ".join( nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_exploded_temp.split(' ') if ' ' in bookid: bookid_exploded = bookid.split(' ') else: bookid_exploded = [bookid] # check nzb starts with magazine title, and ends with a date # eg The MagPI Issue 22 - July 2015 # do something like check left n words match title # then check last n words are a date rejected = False if len(nzbtitle_exploded) > len( bookid_exploded ): # needs to be longer as it has to include a date # check (nearly) all the words in the mag title are in the nzbtitle - allow some fuzz mag_title_match = fuzz.token_set_ratio( unaccented(bookid), unaccented(nzbtitle_formatted)) if mag_title_match < lazylibrarian.MATCH_RATIO: logger.debug(u"Magazine token set Match failed: " + str(mag_title_match) + "% for " + nzbtitle_formatted) rejected = True else: rejected = True if not rejected: already_failed = myDB.match( 'SELECT * from wanted WHERE NZBurl="%s" and Status="Failed"' % nzburl) if already_failed: logger.debug("Rejecting %s, blacklisted at %s" % (nzbtitle_formatted, already_failed['NZBprov'])) rejected = True if not rejected: lower_title = unaccented(nzbtitle_formatted).lower() lower_bookid = unaccented(bookid).lower() for word in reject_list: if word in lower_title and word not in lower_bookid: rejected = True logger.debug("Rejecting %s, contains %s" % (nzbtitle_formatted, word)) break # maxsize = formatter.check_int(lazylibrarian.REJECT_MAXSIZE, 0) # if maxsize and nzbsize > maxsize: # rejected = True # logger.debug("Rejecting %s, too large" % nzbtitle_formatted) if not rejected: # some magazine torrent uploaders add their sig in [] or {} # Fortunately for us, they always seem to add it at the end # also some magazine torrent titles are "magazine_name some_form_of_date pdf" # so strip all the trailing junk... while nzbtitle_exploded[len(nzbtitle_exploded) - 1][0] in '[{' or \ nzbtitle_exploded[len(nzbtitle_exploded) - 1].lower() in ['pdf', 'true', 'truepdf']: nzbtitle_exploded.pop( ) # gotta love the function names # need at least one word magazine title and two date components if len(nzbtitle_exploded) > 2: # regexA = DD MonthName YYYY OR MonthName YYYY or Issue nn, MonthName YYYY regexA_year = nzbtitle_exploded[ len(nzbtitle_exploded) - 1] regexA_month_temp = nzbtitle_exploded[ len(nzbtitle_exploded) - 2] regexA_month = month2num( unaccented(regexA_month_temp)) if not regexA_year.isdigit() or int( regexA_year) < 1900 or int( regexA_year) > 2100: regexA_year = 'fail' # force date failure # if frequency == "Weekly" or frequency == "BiWeekly": regexA_day = nzbtitle_exploded[ len(nzbtitle_exploded) - 3].rstrip(',').zfill(2) if regexA_day.isdigit(): if int(regexA_day ) > 31: # probably issue number nn regexA_day = '01' else: regexA_day = '01' # just MonthName YYYY # else: # regexA_day = '01' # monthly, or less frequent try: newdatish = regexA_year + '-' + regexA_month + '-' + regexA_day # try to make sure the year/month/day are valid, exception if not # ie don't accept day > 31, or 30 in some months # also handles multiple date format named issues eg Jan 2014, 01 2014 # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexA_year), int(regexA_month), int(regexA_day)) except ValueError: # regexB = MonthName DD YYYY or MonthName DD, YYYY regexB_year = nzbtitle_exploded[ len(nzbtitle_exploded) - 1] regexB_month_temp = nzbtitle_exploded[ len(nzbtitle_exploded) - 3] regexB_month = month2num( unaccented(regexB_month_temp)) regexB_day = nzbtitle_exploded[ len(nzbtitle_exploded) - 2].rstrip(',').zfill(2) if not regexB_year.isdigit() or int( regexB_year) < 1900 or int( regexB_year) > 2100: regexB_year = 'fail' try: newdatish = regexB_year + '-' + regexB_month + '-' + regexB_day # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date( int(regexB_year), int(regexB_month), int(regexB_day)) except ValueError: # regexC = YYYY MM or YYYY MM DD # (can't get MM/DD if named YYYY Issue nn) # First try YYYY MM regexC_year = nzbtitle_exploded[ len(nzbtitle_exploded) - 2] if regexC_year.isdigit( ) and int(regexC_year) > 1900 and int( regexC_year) < 2100: regexC_month = nzbtitle_exploded[ len(nzbtitle_exploded) - 1].zfill(2) regexC_day = '01' else: # try YYYY MM DD regexC_year = nzbtitle_exploded[ len(nzbtitle_exploded) - 3] regexC_month = 0 regexC_day = 0 if regexC_year.isdigit( ) and int(regexC_year) > 1900 and int( regexC_year) < 2100: regexC_month = nzbtitle_exploded[ len(nzbtitle_exploded) - 2].zfill(2) regexC_day = nzbtitle_exploded[ len(nzbtitle_exploded) - 1].zfill(2) else: regexC_year = 'fail' try: newdatish = regexC_year + '-' + regexC_month + '-' + regexC_day # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date( int(regexC_year), int(regexC_month), int(regexC_day)) except Exception: # regexD Issue/No/Vol nn, YYYY or Issue/No/Vol nn try: IssueLabel = nzbtitle_exploded[ len(nzbtitle_exploded) - 2] if IssueLabel.lower() in [ "issue", "no", "vol" ]: # issue nn regexD_issue = nzbtitle_exploded[ len(nzbtitle_exploded) - 1] if regexD_issue.isdigit(): newdatish = str( int(regexD_issue) ) # 4 == 04 == 004 else: IssueLabel = nzbtitle_exploded[ len(nzbtitle_exploded) - 3] if IssueLabel.lower() in [ "issue", "no", "vol" ]: # issue nn, YYYY regexD_issue = nzbtitle_exploded[ len(nzbtitle_exploded) - 2] regexD_issue = regexD_issue.strip( ',') if regexD_issue.isdigit(): newdatish = str( int(regexD_issue) ) # 4 == 04 == 004 else: raise ValueError regexD_year = nzbtitle_exploded[ len(nzbtitle_exploded) - 1] if regexD_year.isdigit(): if int( regexD_year ) < int(datetime.date. today().year): newdatish = 0 # it's old else: raise ValueError except Exception: logger.debug( 'Magazine %s not in proper date format.' % nzbtitle_formatted) bad_date = bad_date + 1 # allow issues with good name but bad date to be included # so user can manually select them, incl those with issue numbers newdatish = "1970-01-01" # provide a fake date for bad-date issues # continue else: logger.debug( 'Magazine [%s] does not match the search term [%s].' % (nzbtitle_formatted, bookid)) bad_regex = bad_regex + 1 continue # wanted issues go into wanted table marked "Wanted" # the rest into pastissues table marked "Skipped" insert_table = "pastissues" insert_status = "Skipped" if control_date is None: # we haven't got any copies of this magazine yet # get a rough time just over a month ago to compare to, in format yyyy-mm-dd # could perhaps calc differently for weekly, biweekly etc # or for magazines with only an issue number, use zero if '-' in str(newdatish): start_time = time.time() start_time -= 31 * 24 * 60 * 60 # number of seconds in 31 days control_date = time.strftime( "%Y-%m-%d", time.localtime(start_time)) else: control_date = 0 if '-' in str(control_date) and '-' in str(newdatish): # only grab a copy if it's newer than the most recent we have, # or newer than a month ago if we have none comp_date = datecompare(newdatish, control_date) elif '-' not in str(control_date) and '-' not in str( newdatish): # for issue numbers, check if later than last one we have comp_date = int(newdatish) - int(control_date) newdatish = "%s" % newdatish newdatish = newdatish.zfill( 4) # pad so we sort correctly else: # invalid comparison of date and issue number logger.debug( 'Magazine %s incorrect date or issue format.' % nzbtitle_formatted) bad_date = bad_date + 1 newdatish = "1970-01-01" # this is our fake date for ones we can't decipher comp_date = 0 if comp_date > 0: # keep track of what we're going to download so we don't download dupes new_date = new_date + 1 issue = bookid + ',' + newdatish if issue not in issues: maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl, 'nzbmode': nzbmode }) logger.debug( 'This issue of %s is new, downloading' % nzbtitle_formatted) issues.append(issue) insert_table = "wanted" insert_status = "Wanted" nzbdate = now() # when we asked for it else: logger.debug( 'This issue of %s is already flagged for download' % issue) else: if newdatish != "1970-01-01": # this is our fake date for ones we can't decipher logger.debug( 'This issue of %s is old; skipping.' % nzbtitle_formatted) old_date = old_date + 1 # store only the _new_ matching results # Don't add a new entry if this issue has been found on an earlier search # and status has been user-set ( we only delete the "Skipped" ones ) # In "wanted" table it might be already snatched/downloading/processing mag_entry = myDB.select( 'SELECT * from %s WHERE NZBtitle="%s" and NZBprov="%s"' % (insert_table, nzbtitle, nzbprov)) if not mag_entry: controlValueDict = { "NZBtitle": nzbtitle, "NZBprov": nzbprov } newValueDict = { "NZBurl": nzburl, "BookID": bookid, "NZBdate": nzbdate, "AuxInfo": newdatish, "Status": insert_status, "NZBsize": nzbsize, "NZBmode": nzbmode } myDB.upsert(insert_table, newValueDict, controlValueDict) else: # logger.debug('Magazine [%s] was rejected.' % nzbtitle_formatted) bad_regex = bad_regex + 1 logger.info( 'Found %i result%s for %s. %i new, %i old, %i fail date, %i fail name: %i to download' % (total_nzbs, plural(total_nzbs), bookid, new_date, old_date, bad_date, bad_regex, len(maglist))) for magazine in maglist: if magazine['nzbmode'] in ["torznab", "torrent", "magnet"]: snatch = TORDownloadMethod(magazine['bookid'], magazine['nzbprov'], magazine['nzbtitle'], magazine['nzburl']) else: snatch = NZBDownloadMethod(magazine['bookid'], magazine['nzbprov'], magazine['nzbtitle'], magazine['nzburl']) if snatch: logger.info('Downloading %s from %s' % (magazine['nzbtitle'], magazine["nzbprov"])) notify_snatch("%s from %s at %s" % (unaccented( magazine['nzbtitle']), magazine["nzbprov"], now())) scheduleJob(action='Start', target='processDir') maglist = [] if reset: scheduleJob(action='Restart', target='search_magazines') logger.info("Search for magazines complete")
def processResultList(resultlist, author, title, book): myDB = database.DBConnection() dictrepl = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': ' ', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '', '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '', '!': '', '-': ' ', '\s\s': ' '} # ' the ': ' ', ' a ': ' ', ' and ': ' ', ' to ': ' ', ' of ': ' ', # ' for ': ' ', ' my ': ' ', ' in ': ' ', ' at ': ' ', ' with ': ' '} match_ratio = int(lazylibrarian.MATCH_RATIO) reject_list = formatter.getList(lazylibrarian.REJECT_WORDS) # bit of a misnomer now, rss can search both tor and nzb rss feeds for tor in resultlist: tor_Title = formatter.latinToAscii(formatter.replace_all(tor['tor_title'], dictrepl)).strip() tor_Title = re.sub(r"\s\s+", " ", tor_Title) # remove extra whitespace tor_Author_match = fuzz.token_set_ratio(author, tor_Title) tor_Title_match = fuzz.token_set_ratio(title, tor_Title) logger.debug("RSS Author/Title Match: %s/%s for %s" %(tor_Author_match, tor_Title_match, tor_Title)) rejected = False for word in reject_list: if word in tor_Title.lower() and not word in author.lower() and not word in book.lower(): rejected = True logger.debug("Rejecting %s, contains %s" % (tor_Title, word)) break if (tor_Title_match >= match_ratio and tor_Author_match >= match_ratio and not rejected): logger.debug(u'Found RSS: %s' % tor['tor_title']) bookid = book['bookid'] tor_Title = (book["authorName"] + ' - ' + book['bookName'] + ' LL.(' + book['bookid'] + ')').strip() tor_url = tor['tor_url'] tor_prov = tor['tor_prov'] tor_feed = tor['tor_feed'] tor_size_temp = tor['tor_size'] # Need to cater for when this is NONE (Issue 35) if tor_size_temp is None: tor_size_temp = 1000 tor_size = str(round(float(tor_size_temp) / 1048576, 2)) + ' MB' controlValueDict = {"NZBurl": tor_url} newValueDict = { "NZBprov": tor_prov, "BookID": bookid, "NZBdate": formatter.now(), # when we asked for it "NZBsize": tor_size, "NZBtitle": tor_Title, "NZBmode": "torrent", "Status": "Skipped" } myDB.upsert("wanted", newValueDict, controlValueDict) snatchedbooks = myDB.action('SELECT * from books WHERE BookID="%s" and Status="Snatched"' % bookid).fetchone() if not snatchedbooks: # check if one of the other downloaders got there first if '.nzb' in tor_url: snatch = NZBDownloadMethod(bookid, tor_prov, tor_Title, tor_url) else: # http://baconbits.org/torrents.php?action=download&authkey=<authkey>&torrent_pass=<password.hashed>&id=185398 if not tor_url.startswith('magnet'): # magnets don't use auth pwd = lazylibrarian.RSS_PROV[tor_feed]['PASS'] auth = lazylibrarian.RSS_PROV[tor_feed]['AUTH'] # don't know what form of password hash is required, try sha1 tor_url = tor_url.replace('<authkey>', auth).replace('<password.hashed>', sha1(pwd)) snatch = TORDownloadMethod(bookid, tor_prov, tor_Title, tor_url) if snatch: notifiers.notify_snatch(formatter.latinToAscii(tor_Title) + ' at ' + formatter.now()) common.schedule_job(action='Start', target='processDir') return True logger.debug("No RSS found for " + (book["authorName"] + ' ' + book['bookName']).strip()) return False
def find_results(self, searchterm=None, queue=None): """ GoogleBooks performs much better if we search for author OR title not both at once, so if searchterm is not isbn, two searches needed. Lazylibrarian searches use <ll> to separate title from author in searchterm If this token isn't present, it's an isbn or searchterm as supplied by user """ try: myDB = database.DBConnection() resultlist = [] # See if we should check ISBN field, otherwise ignore it api_strings = ['inauthor:', 'intitle:'] if is_valid_isbn(searchterm): api_strings = ['isbn:'] api_hits = 0 ignored = 0 total_count = 0 no_author_count = 0 title = '' authorname = '' if ' <ll> ' in searchterm: # special token separates title from author title, authorname = searchterm.split(' <ll> ') fullterm = searchterm.replace(' <ll> ', ' ') logger.debug('Now searching Google Books API with searchterm: %s' % fullterm) for api_value in api_strings: set_url = self.url if api_value == "isbn:": set_url = set_url + quote(api_value + searchterm) elif api_value == 'intitle:': searchterm = fullterm if title: # just search for title # noinspection PyUnresolvedReferences title = title.split(' (')[0] # without any series info searchterm = title searchterm = searchterm.replace("'", "").replace( '"', '').strip() # and no quotes if PY2: searchterm = searchterm.encode( lazylibrarian.SYS_ENCODING) set_url = set_url + quote(api_value + '"' + searchterm + '"') elif api_value == 'inauthor:': searchterm = fullterm if authorname: searchterm = authorname # just search for author searchterm = searchterm.strip() if PY2: searchterm = searchterm.encode( lazylibrarian.SYS_ENCODING) set_url = set_url + quote_plus(api_value + '"' + searchterm + '"') startindex = 0 resultcount = 0 ignored = 0 number_results = 1 total_count = 0 no_author_count = 0 try: while startindex < number_results: self.params['startIndex'] = startindex URL = set_url + '&' + urlencode(self.params) try: jsonresults, in_cache = gb_json_request(URL) if jsonresults is None: number_results = 0 else: if not in_cache: api_hits += 1 number_results = jsonresults['totalItems'] logger.debug('Searching url: ' + URL) if number_results == 0: logger.warn( 'Found no results for %s with value: %s' % (api_value, searchterm)) break else: pass except Exception as err: if hasattr(err, 'reason'): errmsg = err.reason else: errmsg = str(err) logger.warn( 'Google Books API Error [%s]: Check your API key or wait a while' % errmsg) break startindex += 40 for item in jsonresults['items']: total_count += 1 book = bookdict(item) if not book['author']: logger.debug( 'Skipped a result without authorfield.') no_author_count += 1 continue if not book['name']: logger.debug('Skipped a result without title.') continue valid_langs = getList( lazylibrarian.CONFIG['IMP_PREFLANG']) if "All" not in valid_langs: # don't care about languages, accept all try: # skip if language is not in valid list - booklang = book['lang'] if booklang not in valid_langs: logger.debug( 'Skipped %s with language %s' % (book['name'], booklang)) ignored += 1 continue except KeyError: ignored += 1 logger.debug( 'Skipped %s where no language is found' % book['name']) continue if authorname: author_fuzz = fuzz.ratio( book['author'], authorname) else: author_fuzz = fuzz.ratio( book['author'], fullterm) if title: book_fuzz = fuzz.token_set_ratio( book['name'], title) # lose a point for each extra word in the fuzzy matches so we get the closest match words = len(getList(book['name'])) words -= len(getList(title)) book_fuzz -= abs(words) else: book_fuzz = fuzz.token_set_ratio( book['name'], fullterm) isbn_fuzz = 0 if is_valid_isbn(fullterm): isbn_fuzz = 100 highest_fuzz = max((author_fuzz + book_fuzz) / 2, isbn_fuzz) dic = {':': '.', '"': '', '\'': ''} bookname = replace_all(book['name'], dic) bookname = unaccented(bookname) bookname = bookname.strip() # strip whitespace AuthorID = '' if book['author']: match = myDB.match( 'SELECT AuthorID FROM authors WHERE AuthorName=?', (book['author'].replace('"', '""'), )) if match: AuthorID = match['AuthorID'] resultlist.append({ 'authorname': book['author'], 'authorid': AuthorID, 'bookid': item['id'], 'bookname': bookname, 'booksub': book['sub'], 'bookisbn': book['isbn'], 'bookpub': book['pub'], 'bookdate': book['date'], 'booklang': book['lang'], 'booklink': book['link'], 'bookrate': float(book['rate']), 'bookrate_count': book['rate_count'], 'bookimg': book['img'], 'bookpages': book['pages'], 'bookgenre': book['genre'], 'bookdesc': book['desc'], 'author_fuzz': author_fuzz, 'book_fuzz': book_fuzz, 'isbn_fuzz': isbn_fuzz, 'highest_fuzz': highest_fuzz, 'num_reviews': book['ratings'] }) resultcount += 1 except KeyError: break logger.debug( "Returning %s result%s for (%s) with keyword: %s" % (resultcount, plural(resultcount), api_value, searchterm)) logger.debug("Found %s result%s" % (total_count, plural(total_count))) logger.debug("Removed %s unwanted language result%s" % (ignored, plural(ignored))) logger.debug("Removed %s book%s with no author" % (no_author_count, plural(no_author_count))) logger.debug( 'The Google Books API was hit %s time%s for searchterm: %s' % (api_hits, plural(api_hits), fullterm)) queue.put(resultlist) except Exception: logger.error('Unhandled exception in GB.find_results: %s' % traceback.format_exc())
def search_magazines(mags=None, reset=False): # produce a list of magazines to search for, tor, nzb, torznab myDB = database.DBConnection() searchlist = [] threading.currentThread().name = "SEARCHMAGS" if mags is None: # backlog search searchmags = myDB.select('SELECT Title, LastAcquired, \ IssueDate from magazines WHERE Status="Active"') else: searchmags = [] for magazine in mags: searchmags_temp = myDB.select('SELECT Title, LastAcquired, IssueDate from magazines \ WHERE Title="%s" AND Status="Active"' % (magazine['bookid'])) for terms in searchmags_temp: searchmags.append(terms) if len(searchmags) == 1: logger.info('Searching for one magazine') else: logger.info('Searching for %i magazines' % len(searchmags)) for searchmag in searchmags: bookid = searchmag[0] searchterm = searchmag[0] # frequency = searchmag[1] # last_acquired = searchmag[2] # issue_date = searchmag[3] dic = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': ''} searchterm = formatter.latinToAscii(formatter.replace_all(searchterm, dic)) searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8') searchlist.append({"bookid": bookid, "searchterm": searchterm}) if searchlist == []: logger.warn('There is nothing to search for. Mark some magazines as active.') for book in searchlist: resultlist = [] tor_resultlist = [] if lazylibrarian.USE_NZB(): resultlist, nproviders = providers.IterateOverNewzNabSites(book, 'mag') if not nproviders: logger.warn('No nzb providers are set. Check config for NEWZNAB or TORZNAB providers') if lazylibrarian.USE_TOR(): tor_resultlist, nproviders = providers.IterateOverTorrentSites(book, 'mag') if not nproviders: logger.warn('No torrent providers are set. Check config for TORRENT providers') for item in tor_resultlist: # reformat the torrent results so they look like nzbs resultlist.append({ 'bookid': item['bookid'], 'nzbprov': item['tor_prov'], 'nzbtitle': item['tor_title'], 'nzburl': item['tor_url'], 'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100', # fake date as none returned from torrents 'nzbsize': item['tor_size'], 'nzbmode': 'torrent' }) if not resultlist: logger.debug("Adding magazine %s to queue." % book['searchterm']) else: bad_regex = 0 bad_date = 0 old_date = 0 total_nzbs = 0 new_date = 0 to_snatch = 0 maglist = [] issues = [] reject_list = formatter.getList(lazylibrarian.REJECT_WORDS) for nzb in resultlist: total_nzbs = total_nzbs + 1 bookid = nzb['bookid'] nzbtitle = (u'%s' % nzb['nzbtitle']) nzbtitle = nzbtitle.replace('"', '').replace("'", "") # suppress " in titles nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] if nzbsize_temp is None: # not all torrents returned by torznab have a size nzbsize_temp = 1000 nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + ' MB' nzbdate = formatter.nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] checkifmag = myDB.select('SELECT * from magazines WHERE Title="%s"' % bookid) if checkifmag: for results in checkifmag: control_date = results['IssueDate'] # frequency = results['Frequency'] # regex = results['Regex'] nzbtitle_formatted = nzbtitle.replace('.', ' ').replace('-', ' ').replace('/', ' ').replace( '+', ' ').replace('_', ' ').replace('(', '').replace(')', '').strip() # Need to make sure that substrings of magazine titles don't get found # (e.g. Maxim USA will find Maximum PC USA) - token_set_ratio takes care of this # keyword_check = nzbtitle_formatted.replace(bookid, '') # remove extra spaces if they're in a row nzbtitle_exploded_temp = " ".join(nzbtitle_formatted.split()) nzbtitle_exploded = nzbtitle_exploded_temp.split(' ') if ' ' in bookid: bookid_exploded = bookid.split(' ') else: bookid_exploded = [bookid] # check nzb starts with magazine title, and ends with a date # eg The MagPI Issue 22 - July 2015 # do something like check left n words match title # then check last n words are a date name_match = 1 # assume name matches for now if len(nzbtitle_exploded) > len(bookid_exploded): # needs to be longer as it has to include a date # check (nearly) all the words in the mag title are in the nzbtitle - allow some fuzz mag_title_match = fuzz.token_set_ratio(common.remove_accents(bookid), common.remove_accents(nzbtitle_formatted)) if mag_title_match < lazylibrarian.MATCH_RATIO: logger.debug(u"Magazine token set Match failed: " + str(mag_title_match) + "% for " + nzbtitle_formatted) name_match = 0 lower_title = common.remove_accents(nzbtitle_formatted).lower() lower_bookid = common.remove_accents(bookid).lower() for word in reject_list: if word in lower_title and not word in lower_bookid: name_match = 0 logger.debug("Rejecting %s, contains %s" % (nzbtitle_formatted, word)) break if name_match: # some magazine torrent uploaders add their sig in [] or {} # Fortunately for us, they always seem to add it at the end # also some magazine torrent titles are "magazine_name some_form_of_date pdf" # so strip all the trailing junk... while nzbtitle_exploded[len(nzbtitle_exploded) - 1][0] in '[{' or \ nzbtitle_exploded[len(nzbtitle_exploded) - 1].lower() == 'pdf': nzbtitle_exploded.pop() # gotta love the function names # need at least one word magazine title and two date components if len(nzbtitle_exploded) > 2: # regexA = DD MonthName YYYY OR MonthName YYYY or Issue nn MonthName YYYY regexA_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexA_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 2] regexA_month = formatter.month2num(common.remove_accents(regexA_month_temp)) if not regexA_year.isdigit() or int(regexA_year) < 1900 or int(regexA_year) > 2100: regexA_year = 'fail' # force date failure #if frequency == "Weekly" or frequency == "BiWeekly": regexA_day = nzbtitle_exploded[len(nzbtitle_exploded) - 3].zfill(2) if regexA_day.isdigit(): if int(regexA_day) > 31: # probably issue number nn regexA_day = '01' else: regexA_day = '01' # just MonthName YYYY #else: # regexA_day = '01' # monthly, or less frequent try: newdatish = regexA_year + '-' + regexA_month + '-' + regexA_day # try to make sure the year/month/day are valid, exception if not # ie don't accept day > 31, or 30 in some months # also handles multiple date format named issues eg Jan 2014, 01 2014 # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexA_year), int(regexA_month), int(regexA_day)) except ValueError: # regexB = MonthName DD YYYY regexB_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1] regexB_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 3] regexB_month = formatter.month2num(common.remove_accents(regexB_month_temp)) regexB_day = nzbtitle_exploded[len(nzbtitle_exploded) - 2].zfill(2) if not regexB_year.isdigit() or int(regexB_year) < 1900 or int(regexB_year) > 2100: regexB_year = 'fail' try: newdatish = regexB_year + '-' + regexB_month + '-' + regexB_day # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexB_year), int(regexB_month), int(regexB_day)) except ValueError: # regexC = YYYY MM or YYYY MM DD # (can't get MM/DD if named YYYY Issue nn) # First try YYYY MM regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 2] if regexC_year.isdigit() and int(regexC_year) > 1900 and int(regexC_year) < 2100: regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2) regexC_day = '01' else: # try YYYY MM DD regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 3] if regexC_year.isdigit() and int(regexC_year) > 1900 and int(regexC_year) < 2100: regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 2].zfill(2) regexC_day = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2) else: regexC_year = 'fail' try: newdatish = regexC_year + '-' + regexC_month + '-' + regexC_day # datetime will give a ValueError if not a good date or a param is not int date1 = datetime.date(int(regexC_year), int(regexC_month), int(regexC_day)) except: logger.debug('Magazine %s not in proper date format.' % nzbtitle_formatted) bad_date = bad_date + 1 # allow issues with good name but bad date to be included # so user can manually select them, incl those with issue numbers newdatish = "1970-01-01" # provide a fake date for bad-date issues # continue else: continue if control_date is None: # we haven't got any copies of this magazine yet # get a rough time just over a month ago to compare to, in format yyyy-mm-dd # could perhaps calc differently for weekly, biweekly etc start_time = time.time() start_time -= 31 * 24 * 60 * 60 # number of seconds in 31 days control_date = time.strftime("%Y-%m-%d", time.localtime(start_time)) # only grab a copy if it's newer than the most recent we have, # or newer than a month ago if we have none comp_date = formatter.datecompare(newdatish, control_date) if comp_date > 0: # Should probably only upsert when downloaded and processed in case snatch fails # keep track of what we're going to download so we don't download dupes new_date = new_date + 1 issue = bookid + ',' + newdatish if issue not in issues: maglist.append({ 'bookid': bookid, 'nzbprov': nzbprov, 'nzbtitle': nzbtitle, 'nzburl': nzburl, 'nzbmode': nzbmode }) logger.debug('This issue of %s is new, downloading' % nzbtitle_formatted) to_snatch = to_snatch + 1 issues.append(issue) controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": formatter.now(), # when we asked for it "NZBtitle": nzbtitle, "AuxInfo": newdatish, "Status": "Wanted", "NZBsize": nzbsize, "NZBmode": nzbmode } myDB.upsert("wanted", newValueDict, controlValueDict) else: logger.debug('This issue of %s is already flagged for download' % issue) else: if newdatish != "1970-01-01": # this is our fake date for ones we can't decipher logger.debug('This issue of %s is old; skipping.' % nzbtitle_formatted) old_date = old_date + 1 else: logger.debug('Magazine [%s] does not completely match search term [%s].' % ( nzbtitle_formatted, bookid)) bad_regex = bad_regex + 1 logger.info('Found %s results for %s. %s are new, %s are old, %s fail date, %s fail name matching' % ( total_nzbs, bookid, new_date, old_date, bad_date, bad_regex)) logger.info("%s, %s issues to download" % (bookid, to_snatch)) for items in maglist: if items['nzbmode'] == "torznab": snatch = TORDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl']) elif items['nzbmode'] == "torrent": snatch = TORDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl']) else: snatch = NZBDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl']) if snatch: notifiers.notify_snatch(formatter.latinToAscii(items['nzbtitle']) + ' at ' + formatter.now()) common.schedule_job(action='Start', target='processDir') maglist = [] if reset == True: common.schedule_job(action='Restart', target='search_magazines') logger.info("Search for magazines complete")
def processDir(reset=False): threadname = threading.currentThread().name if "Thread-" in threadname: threading.currentThread().name = "POSTPROCESS" if not lazylibrarian.DOWNLOAD_DIR or not os.path.isdir(lazylibrarian.DOWNLOAD_DIR): processpath = os.getcwd() else: processpath = lazylibrarian.DOWNLOAD_DIR logger.debug(' Checking [%s] for files to post process' % processpath) try: downloads = os.listdir(processpath) except OSError as why: logger.error('Could not access [%s] directory [%s]' % (processpath, why.strerror)) return myDB = database.DBConnection() snatched = myDB.select('SELECT * from wanted WHERE Status="Snatched"') if len(snatched) == 0: logger.info('Nothing marked as snatched.') scheduleJob(action='Stop', target='processDir') return if len(downloads) == 0: logger.info('No downloads are found. Nothing to process.') return logger.info("Checking %s download%s for %s snatched file%s" % (len(downloads), plural(len(downloads)), len(snatched), plural(len(snatched)))) ppcount = 0 for book in snatched: matches = [] for fname in downloads: if not fname.endswith('.fail'): # has this failed before? # this is to get round differences in torrent filenames. # Torrents aren't always returned with the name we searched for # there might be a better way... if isinstance(fname, str): matchname = fname.decode(lazylibrarian.SYS_ENCODING) else: matchname = fname if ' LL.(' in matchname: matchname = matchname.split(' LL.(')[0] matchtitle = book['NZBtitle'] match = 0 if matchtitle: if ' LL.(' in matchtitle: matchtitle = matchtitle.split(' LL.(')[0] match = fuzz.token_set_ratio(matchtitle, matchname) if match >= lazylibrarian.DLOAD_RATIO: fname = matchname if os.path.isfile(os.path.join(processpath, fname)): # handle single file downloads here... if is_valid_booktype(fname, booktype="book") \ or is_valid_booktype(fname, booktype="mag"): dirname = os.path.join(processpath, os.path.splitext(fname)[0]) if not os.path.exists(dirname): try: os.makedirs(dirname) except OSError as why: logger.debug('Failed to create directory %s, %s' % (dirname, why.strerror)) if os.path.exists(dirname): try: shutil.move(os.path.join(processpath, fname), os.path.join(dirname, fname)) fname = os.path.splitext(fname)[0] except Exception as why: logger.debug("Failed to move file %s to %s, %s" % (fname, dirname, str(why))) if os.path.isdir(os.path.join(processpath, fname)): pp_path = os.path.join(processpath, fname) logger.debug('Found folder (%s%%) %s for %s' % (match, pp_path, book['NZBtitle'])) matches.append([match, pp_path, book]) else: logger.debug('No match (%s%%) %s for %s' % (match, matchname, matchtitle)) else: logger.debug('Skipping %s' % fname) if matches: highest = max(matches, key=lambda x: x[0]) match = highest[0] pp_path = highest[1] book = highest[2] logger.info(u'Best match (%s%%): %s for %s' % (match, pp_path, book['NZBtitle'])) data = myDB.select('SELECT * from books WHERE BookID="%s"' % book['BookID']) if data: authorname = data[0]['AuthorName'] bookname = data[0]['BookName'] if 'windows' in platform.system().lower() and '/' in lazylibrarian.EBOOK_DEST_FOLDER: logger.warn('Please check your EBOOK_DEST_FOLDER setting') lazylibrarian.EBOOK_DEST_FOLDER = lazylibrarian.EBOOK_DEST_FOLDER.replace('/', '\\') # Default destination path, should be allowed change per config file. dest_path = lazylibrarian.EBOOK_DEST_FOLDER.replace('$Author', authorname).replace( '$Title', bookname) global_name = lazylibrarian.EBOOK_DEST_FILE.replace('$Author', authorname).replace( '$Title', bookname) global_name = unaccented(global_name) # dest_path = authorname+'/'+bookname # global_name = bookname + ' - ' + authorname # Remove characters we don't want in the filename BEFORE adding to DESTINATION_DIR # as windows drive identifiers have colon, eg c: but no colons allowed elsewhere? dic = {'<': '', '>': '', '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': ''} dest_path = unaccented_str(replace_all(dest_path, dic)) dest_path = os.path.join(lazylibrarian.DESTINATION_DIR, dest_path).encode( lazylibrarian.SYS_ENCODING) else: data = myDB.select('SELECT * from magazines WHERE Title="%s"' % book['BookID']) if data: # AuxInfo was added for magazine release date, normally housed in 'magazines' but if multiple # files are downloading, there will be an error in post-processing, trying to go to the # same directory. mostrecentissue = data[0]['IssueDate'] # keep for processing issues arriving out of order # Remove characters we don't want in the filename before (maybe) adding to DESTINATION_DIR # as windows drive identifiers have colon, eg c: but no colons allowed elsewhere? dic = {'<': '', '>': '', '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': ''} mag_name = unaccented_str(replace_all(book['BookID'], dic)) # book auxinfo is a cleaned date, eg 2015-01-01 dest_path = lazylibrarian.MAG_DEST_FOLDER.replace( '$IssueDate', book['AuxInfo']).replace('$Title', mag_name) # dest_path = '_Magazines/'+title+'/'+book['AuxInfo'] if lazylibrarian.MAG_RELATIVE: if dest_path[0] not in '._': dest_path = '_' + dest_path dest_path = os.path.join(lazylibrarian.DESTINATION_DIR, dest_path).encode( lazylibrarian.SYS_ENCODING) else: dest_path = dest_path.encode(lazylibrarian.SYS_ENCODING) authorname = None bookname = None global_name = lazylibrarian.MAG_DEST_FILE.replace('$IssueDate', book['AuxInfo']).replace( '$Title', mag_name) global_name = unaccented(global_name) # global_name = book['AuxInfo']+' - '+title else: logger.debug("Snatched magazine %s is not in download directory" % (book['BookID'])) continue else: logger.debug("Snatched %s %s is not in download directory" % (book['NZBmode'], book['NZBtitle'])) continue processBook = processDestination(pp_path, dest_path, authorname, bookname, global_name) if processBook: logger.debug("Processing %s, %s" % (global_name, book['NZBurl'])) # update nzbs, only update the snatched ones in case multiple matches for same book / magazine issue controlValueDict = {"NZBurl": book['NZBurl'], "Status": "Snatched"} newValueDict = {"Status": "Processed", "NZBDate": now()} # say when we processed it myDB.upsert("wanted", newValueDict, controlValueDict) if bookname is not None: # it's a book, if None it's a magazine if len(lazylibrarian.IMP_CALIBREDB): logger.debug('Calibre should have created the extras for us') else: processExtras(myDB, dest_path, global_name, data) else: # update mags controlValueDict = {"Title": book['BookID']} if mostrecentissue: if mostrecentissue.isdigit() and str(book['AuxInfo']).isdigit(): older = int(mostrecentissue) > int(book['AuxInfo']) # issuenumber else: older = mostrecentissue > book['AuxInfo'] # YYYY-MM-DD else: older = False if older: # check this in case processing issues arriving out of order newValueDict = {"LastAcquired": today(), "IssueStatus": "Open"} else: newValueDict = {"IssueDate": book['AuxInfo'], "LastAcquired": today(), "IssueStatus": "Open"} myDB.upsert("magazines", newValueDict, controlValueDict) # dest_path is where we put the magazine after processing, but we don't have the full filename # so look for any "book" in that directory dest_file = book_file(dest_path, booktype='mag') controlValueDict = {"Title": book['BookID'], "IssueDate": book['AuxInfo']} newValueDict = {"IssueAcquired": today(), "IssueFile": dest_file, "IssueID": create_id("%s %s" % (book['BookID'], book['AuxInfo'])) } myDB.upsert("issues", newValueDict, controlValueDict) # create a thumbnail cover for the new issue create_cover(dest_file) logger.info('Successfully processed: %s' % global_name) ppcount = ppcount + 1 notify_download("%s at %s" % (global_name, now())) else: logger.error('Postprocessing for %s has failed.' % global_name) logger.error('Warning - Residual files remain in %s.fail' % pp_path) controlValueDict = {"NZBurl": book['NZBurl'], "Status": "Snatched"} newValueDict = {"Status": "Failed", "NZBDate": now()} myDB.upsert("wanted", newValueDict, controlValueDict) # if it's a book, reset status so we try for a different version # if it's a magazine, user can select a different one from pastissues table if bookname is not None: myDB.action('UPDATE books SET status = "Wanted" WHERE BookID="%s"' % book['BookID']) # at this point, as it failed we should move it or it will get postprocessed # again (and fail again) try: os.rename(pp_path, pp_path + '.fail') except: logger.debug("Unable to rename %s" % pp_path) downloads = os.listdir(processpath) # check in case we processed/deleted some above for directory in downloads: if "LL.(" in directory and not directory.endswith('.fail'): bookID = str(directory).split("LL.(")[1].split(")")[0] logger.debug("Book with id: " + str(bookID) + " is in downloads") pp_path = os.path.join(processpath, directory) if os.path.isfile(pp_path): pp_path = os.path.join(processpath) if (os.path.isdir(pp_path)): logger.debug('Found LL folder %s.' % pp_path) if import_book(pp_path, bookID): ppcount = ppcount + 1 if ppcount == 0: logger.info('No snatched books/mags have been found') else: logger.info('%s book%s/mag%s processed.' % (ppcount, plural(ppcount), plural(ppcount))) if reset: scheduleJob(action='Restart', target='processDir')
def find_results(self, authorname=None, queue=None): threading.currentThread().name = "GB-SEARCH" resultlist = [] # See if we should check ISBN field, otherwise ignore it try: isbn_check = int(authorname[:-1]) if (len(str(isbn_check)) == 9) or (len(str(isbn_check)) == 12): api_strings = ['isbn:'] else: api_strings = ['inauthor:', 'intitle:'] except: api_strings = ['inauthor:', 'intitle:'] api_hits = 0 logger.debug( 'Now searching Google Books API with keyword: ' + self.name) for api_value in api_strings: startindex = 0 if api_value == "isbn:": set_url = self.url + urllib.quote(api_value + self.name.encode('utf-8')) else: set_url = self.url + \ urllib.quote(api_value + '"' + self.name.encode('utf-8') + '"') try: startindex = 0 resultcount = 0 # removedResults = 0 ignored = 0 number_results = 1 total_count = 0 no_author_count = 0 while startindex < number_results: self.params['startIndex'] = startindex URL = set_url + '&' + urllib.urlencode(self.params) try: jsonresults, in_cache = self.get_request(URL) if jsonresults is None: number_results = 0 else: if not in_cache: api_hits = api_hits + 1 number_results = jsonresults['totalItems'] logger.debug('Searching url: ' + URL) if number_results == 0: logger.warn( 'Found no results for %s with value: %s' % (api_value, self.name)) break else: pass except HTTPError as err: logger.warn( 'Google Books API Error [%s]: Check your API key or wait a while' % err.reason) break startindex = startindex + 40 for item in jsonresults['items']: total_count = total_count + 1 # skip if no author, no author is no book. try: Author = item['volumeInfo']['authors'][0] except KeyError: logger.debug( 'Skipped a result without authorfield.') no_author_count = no_author_count + 1 continue valid_langs = ([valid_lang.strip() for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')]) if "All" not in valid_langs: # don't care about languages, accept all try: # skip if language is not in valid list - booklang = item['volumeInfo']['language'] if booklang not in valid_langs: logger.debug( 'Skipped a book with language %s' % booklang) ignored = ignored + 1 continue except KeyError: ignored = ignored + 1 logger.debug( 'Skipped a result where no language is found') continue try: bookpub = item['volumeInfo']['publisher'] except KeyError: bookpub = None try: booksub = item['volumeInfo']['subtitle'] except KeyError: booksub = None try: bookdate = item['volumeInfo']['publishedDate'] except KeyError: bookdate = '0000-00-00' bookdate = bookdate[:4] try: bookimg = item['volumeInfo']['imageLinks']['thumbnail'] except KeyError: bookimg = 'images/nocover.png' try: bookrate = item['volumeInfo']['averageRating'] except KeyError: bookrate = 0 try: bookpages = item['volumeInfo']['pageCount'] except KeyError: bookpages = '0' try: bookgenre = item['volumeInfo']['categories'][0] except KeyError: bookgenre = None try: bookdesc = item['volumeInfo']['description'] except KeyError: bookdesc = 'Not available' try: num_reviews = item['volumeInfo']['ratingsCount'] except KeyError: num_reviews = 0 try: if item['volumeInfo']['industryIdentifiers'][0]['type'] == 'ISBN_10': bookisbn = item['volumeInfo'][ 'industryIdentifiers'][0]['identifier'] else: bookisbn = 0 except KeyError: bookisbn = 0 author_fuzz = fuzz.token_set_ratio(Author, authorname) book_fuzz = fuzz.token_set_ratio( item['volumeInfo']['title'], authorname) try: isbn_check = int(authorname[:-1]) if (len(str(isbn_check)) == 9) or (len(str(isbn_check)) == 12): isbn_fuzz = int(100) else: isbn_fuzz = int(0) except: isbn_fuzz = int(0) highest_fuzz = max(author_fuzz, book_fuzz, isbn_fuzz) bookname = item['volumeInfo']['title'] bookname = bookname.replace( ':', '').replace('"', '').replace("'", "") bookname = unidecode(u'%s' % bookname) bookname = bookname.strip() # strip whitespace bookid = item['id'] resultlist.append({ 'authorname': Author, 'bookid': bookid, 'bookname': bookname, 'booksub': booksub, 'bookisbn': bookisbn, 'bookpub': bookpub, 'bookdate': bookdate, 'booklang': booklang, 'booklink': item['volumeInfo']['canonicalVolumeLink'], 'bookrate': float(bookrate), 'bookimg': bookimg, 'bookpages': bookpages, 'bookgenre': bookgenre, 'bookdesc': bookdesc, 'author_fuzz': author_fuzz, 'book_fuzz': book_fuzz, 'isbn_fuzz': isbn_fuzz, 'highest_fuzz': highest_fuzz, 'num_reviews': num_reviews }) resultcount = resultcount + 1 except KeyError: break logger.debug("Found %s total results" % total_count) logger.debug("Removed %s bad language results" % ignored) logger.debug("Removed %s books with no author" % no_author_count) logger.debug( "Showing %s results for (%s) with keyword: %s" % (resultcount, api_value, authorname)) logger.debug( 'The Google Books API was hit %s times for keyword %s' % (str(api_hits), self.name)) queue.put(resultlist)
def processDir(force=False, reset=False): # rename this thread threading.currentThread().name = "POSTPROCESS" if not lazylibrarian.DOWNLOAD_DIR or not os.path.isdir(lazylibrarian.DOWNLOAD_DIR): processpath = os.getcwd() else: processpath = lazylibrarian.DOWNLOAD_DIR logger.debug(' Checking [%s] for files to post process' % processpath) try: downloads = os.listdir(processpath) except OSError: logger.error('Could not access [%s] directory ' % processpath) return False myDB = database.DBConnection() snatched = myDB.select('SELECT * from wanted WHERE Status="Snatched"') if force is False and len(snatched) == 0: logger.info('Nothing marked as snatched. Stopping postprocessor job.') common.schedule_job(action='Stop', target='processDir') elif len(downloads) == 0: logger.info('No downloads are found. Nothing to process.') else: ppcount = 0 for book in snatched: found = False for fname in downloads: if not fname.endswith('.fail'): # has this failed before? # this is to get round unicode differences in torrent filenames. # there might be a better way... if isinstance(fname, str): matchname = fname.decode('utf-8') else: matchname = fname if 'LL.(' in matchname: matchname = matchname.split('LL.(')[0] match = fuzz.token_set_ratio(matchname, book['NZBtitle']) if match >= 95: pp_path = os.path.join(processpath, fname) logger.debug('Found folder %s for %s' % (pp_path, book['NZBtitle'])) found = True break if found: data = myDB.select('SELECT * from books WHERE BookID="%s"' % book['BookID']) if data: authorname = data[0]['AuthorName'] bookname = data[0]['BookName'] if 'windows' in platform.system().lower() and '/' in lazylibrarian.EBOOK_DEST_FOLDER: logger.warn('Please check your EBOOK_DEST_FOLDER setting') lazylibrarian.EBOOK_DEST_FOLDER = lazylibrarian.EBOOK_DEST_FOLDER.replace('/', '\\') # Default destination path, should be allowed change per config file. dest_path = lazylibrarian.EBOOK_DEST_FOLDER.replace('$Author', authorname).replace( '$Title', bookname) global_name = lazylibrarian.EBOOK_DEST_FILE.replace('$Author', authorname).replace( '$Title', bookname) # dest_path = authorname+'/'+bookname # global_name = bookname + ' - ' + authorname # Remove characters we don't want in the filename BEFORE adding to DESTINATION_DIR # as windows drive identifiers have colon, eg c: but no colons allowed elsewhere? dic = {'<': '', '>': '', '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': ''} dest_path = formatter.latinToAscii(formatter.replace_all(dest_path, dic)) dest_path = os.path.join(lazylibrarian.DESTINATION_DIR, dest_path).encode( lazylibrarian.SYS_ENCODING) else: data = myDB.select('SELECT * from magazines WHERE Title="%s"' % book['BookID']) if data: # AuxInfo was added for magazine release date, normally housed in 'magazines' but if multiple # files are downloading, there will be an error in post-processing, trying to go to the # same directory. mostrecentissue = data[0]['IssueDate'] # keep for processing issues arriving out of order # Remove characters we don't want in the filename before (maybe) adding to DESTINATION_DIR # as windows drive identifiers have colon, eg c: but no colons allowed elsewhere? dic = {'<': '', '>': '', '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': ''} mag_name = formatter.latinToAscii(formatter.replace_all(book['BookID'], dic)) # book auxinfo is a cleaned date, eg 2015-01-01 dest_path = lazylibrarian.MAG_DEST_FOLDER.replace('$IssueDate', book['AuxInfo']).replace('$Title', mag_name) # dest_path = '_Magazines/'+title+'/'+book['AuxInfo'] if lazylibrarian.MAG_RELATIVE: if dest_path[0] not in '._': dest_path = '_' + dest_path dest_path = os.path.join(lazylibrarian.DESTINATION_DIR, dest_path).encode( lazylibrarian.SYS_ENCODING) else: dest_path = dest_path.encode(lazylibrarian.SYS_ENCODING) authorname = None bookname = None global_name = lazylibrarian.MAG_DEST_FILE.replace('$IssueDate', book['AuxInfo']).replace( '$Title', mag_name) # global_name = book['AuxInfo']+' - '+title else: logger.debug("Snatched magazine %s is not in download directory" % (book['BookID'])) continue else: logger.debug("Snatched %s %s is not in download directory" % (book['NZBmode'], book['NZBtitle'])) continue # try: # os.chmod(dest_path, 0777) # except Exception, e: # logger.debug("Could not chmod post-process directory: " + str(dest_path)) processBook = processDestination(pp_path, dest_path, authorname, bookname, global_name) if processBook: logger.debug("Processing %s, %s" % (global_name, book['NZBurl'])) # update nzbs, only update the snatched ones in case multiple matches for same book / magazine issue controlValueDict = {"NZBurl": book['NZBurl'], "Status": "Snatched"} newValueDict = {"Status": "Processed", "NZBDate": formatter.now()} # say when we processed it myDB.upsert("wanted", newValueDict, controlValueDict) if bookname is not None: # it's a book, if None it's a magazine processExtras(myDB, dest_path, global_name, data) else: # update mags controlValueDict = {"Title": book['BookID']} if mostrecentissue > book['AuxInfo']: # check this in case processing issues arriving out of order newValueDict = {"LastAcquired": formatter.today(), "IssueStatus": "Open"} else: newValueDict = {"IssueDate": book['AuxInfo'], "LastAcquired": formatter.today(), "IssueStatus": "Open"} myDB.upsert("magazines", newValueDict, controlValueDict) # dest_path is where we put the magazine after processing, but we don't have the full filename # so look for any "book" in that directory dest_file = book_file(dest_path, booktype='mag') controlValueDict = {"Title": book['BookID'], "IssueDate": book['AuxInfo']} newValueDict = {"IssueAcquired": formatter.today(), "IssueFile": dest_file, "IssueID" : magazinescan.create_id("%s %s" % (book['BookID'], book['AuxInfo'])) } myDB.upsert("issues", newValueDict, controlValueDict) # create a thumbnail cover for the new issue magazinescan.create_cover(dest_file) logger.info('Successfully processed: %s' % global_name) ppcount = ppcount + 1 notifiers.notify_download(formatter.latinToAscii(global_name) + ' at ' + formatter.now()) else: logger.error('Postprocessing for %s has failed.' % global_name) logger.error('Warning - Residual files remain in %s.fail' % pp_path) # at this point, as it failed we should move it or it will get postprocessed # again (and fail again) try: os.rename(pp_path, pp_path + '.fail') except: logger.debug("Unable to rename %s" % pp_path) downloads = os.listdir(processpath) # check in case we processed/deleted some above for directory in downloads: if "LL.(" in directory and not directory.endswith('.fail'): bookID = str(directory).split("LL.(")[1].split(")")[0] logger.debug("Book with id: " + str(bookID) + " is in downloads") pp_path = os.path.join(processpath, directory) if os.path.isfile(pp_path): pp_path = os.path.join(processpath) if (os.path.isdir(pp_path)): logger.debug('Found LL folder %s.' % pp_path) if import_book(pp_path, bookID): ppcount = ppcount + 1 if ppcount: logger.info('%s books/mags have been processed.' % ppcount) else: logger.info('No snatched books/mags have been found') if reset == True: common.schedule_job(action='Restart', target='processDir')
def find_results(self, authorname=None, queue=None): resultlist = [] api_hits = 0 # Goodreads doesn't like initials followed by spaces, # eg "M L Hamilton", needs "M. L. Hamilton" or "M.L.Hamilton" # but DOES need spaces if not initials eg "Tom.Holt" fails, but "Tom Holt" works if authorname[1] == ' ': authorname = authorname.replace(' ', '.') authorname = authorname.replace('..', '.') url = urllib.quote_plus(authorname.encode(lazylibrarian.SYS_ENCODING)) set_url = 'http://www.goodreads.com/search.xml?q=' + url + '&' + urllib.urlencode( self.params) logger.debug('Now searching GoodReads API with keyword: ' + authorname) logger.debug('Searching for %s at: %s' % (authorname, set_url)) try: try: rootxml, in_cache = get_xml_request(set_url) except Exception as e: logger.error("Error finding results: %s" % e) return if not len(rootxml): logger.debug("Error requesting results") return resultxml = rootxml.getiterator('work') resultcount = 0 for author in resultxml: bookdate = "0001-01-01" if (author.find('original_publication_year').text is None): bookdate = "0000" else: bookdate = author.find('original_publication_year').text authorNameResult = author.find('./best_book/author/name').text booksub = "" bookpub = "" booklang = "Unknown" try: bookimg = author.find('./best_book/image_url').text if (bookimg == 'http://www.goodreads.com/assets/nocover/111x148.png' ): bookimg = 'images/nocover.png' except (KeyError, AttributeError): bookimg = 'images/nocover.png' try: bookrate = author.find('average_rating').text except KeyError: bookrate = 0 bookpages = '0' bookgenre = '' bookdesc = '' bookisbn = '' booklink = 'http://www.goodreads.com/book/show/' + author.find( './best_book/id').text if (author.find('./best_book/title').text is None): bookTitle = "" else: bookTitle = author.find('./best_book/title').text author_fuzz = fuzz.token_set_ratio(authorNameResult, authorname) book_fuzz = fuzz.token_set_ratio(bookTitle, authorname) try: isbn_check = int(authorname[:-1]) if (len(str(isbn_check)) == 9) or (len(str(isbn_check)) == 12): isbn_fuzz = int(100) else: isbn_fuzz = int(0) except: isbn_fuzz = int(0) highest_fuzz = max(author_fuzz, book_fuzz, isbn_fuzz) bookid = author.find('./best_book/id').text resultlist.append({ 'authorname': author.find('./best_book/author/name').text, 'bookid': bookid, 'authorid': author.find('./best_book/author/id').text, 'bookname': bookTitle.encode("ascii", "ignore"), 'booksub': None, 'bookisbn': bookisbn, 'bookpub': bookpub, 'bookdate': bookdate, 'booklang': booklang, 'booklink': booklink, 'bookrate': float(bookrate), 'bookimg': bookimg, 'bookpages': bookpages, 'bookgenre': bookgenre, 'bookdesc': bookdesc, 'author_fuzz': author_fuzz, 'book_fuzz': book_fuzz, 'isbn_fuzz': isbn_fuzz, 'highest_fuzz': highest_fuzz, 'num_reviews': float(bookrate) }) resultcount = resultcount + 1 except urllib2.HTTPError as err: if err.code == 404: logger.error('Received a 404 error when searching for author') if err.code == 403: logger.warn('Access to api is denied: usage exceeded') else: logger.error( 'An unexpected error has occurred when searching for an author' ) logger.debug('Found %s result%s with keyword: %s' % (resultcount, plural(resultcount), authorname)) logger.debug('The GoodReads API was hit %s time%s for keyword %s' % (api_hits, plural(api_hits), authorname)) queue.put(resultlist)
def processResultList(resultlist, book, searchtype): myDB = database.DBConnection() dictrepl = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '', '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '', '!': '', '-': '', '\s\s': ' ', ' the ': ' ', ' a ': ' ', ' and ': ' ', ' to ': ' ', ' of ': ' ', ' for ': ' ', ' my ': ' ', ' in ': ' ', ' at ': ' ', ' with ': ' '} match_ratio = int(lazylibrarian.MATCH_RATIO) for nzb in resultlist: nzbTitle = formatter.latinToAscii(formatter.replace_all(nzb['nzbtitle'], dictrepl)).strip() nzbTitle = re.sub(r"\s\s+", " ", nzbTitle) # remove extra whitespace #nzbTitle_match = fuzz.token_set_ratio(book['searchterm'], nzbTitle) #logger.debug(u"NZB Title sort Match %: " + str(nzbTitle_match) + " for " + nzbTitle) if searchtype == 'book' or searchtype == 'shortbook': nzbTitle_match = fuzz.token_set_ratio(book['searchterm'], nzbTitle) logger.debug(u"NZB token set Match %: " + str(nzbTitle_match) + " for " + nzbTitle) elif searchtype == 'author': nzbTitle_match = fuzz.token_set_ratio(book['authorName'].encode('utf-8'), nzbTitle) logger.debug(u"NZB author Match %: " + str(nzbTitle_match) + " for " + nzbTitle) if nzbTitle_match > match_ratio: nzbTitle_match = fuzz.token_set_ratio(book['bookName'].encode('utf-8'), nzbTitle) logger.debug(u"NZB book Match %: " + str(nzbTitle_match) + " for " + nzbTitle) else: # searchtype == 'general': nzbTitle_match = fuzz.token_set_ratio(book['searchterm'], nzbTitle) logger.debug(u"NZB Title general Match %: " + str(nzbTitle_match) + " for " + nzbTitle) if (nzbTitle_match > match_ratio): logger.debug(u'Found NZB: %s using %s search' % (nzb['nzbtitle'], searchtype)) bookid = book['bookid'] nzbTitle = (book["authorName"] + ' - ' + book['bookName'] + ' LL.(' + book['bookid'] + ')').strip() nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] # Need to cater for when this is NONE (Issue 35) if nzbsize_temp is None: nzbsize_temp = 1000 nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + ' MB' nzbdate = formatter.nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": nzbdate, "NZBsize": nzbsize, "NZBtitle": nzbTitle, "NZBmode": nzbmode, "Status": "Skipped" } myDB.upsert("wanted", newValueDict, controlValueDict) snatchedbooks = myDB.action('SELECT * from books WHERE BookID="%s" and Status="Snatched"' % bookid).fetchone() if not snatchedbooks: if nzbmode == "torznab": snatch = TORDownloadMethod(bookid, nzbprov, nzbTitle, nzburl) else: snatch = NZBDownloadMethod(bookid, nzbprov, nzbTitle, nzburl) if snatch: notifiers.notify_snatch(formatter.latinToAscii(nzbTitle) + ' at ' + formatter.now()) postprocess.schedule_processor(action='Start') return True logger.debug("No nzb's found for " + (book["authorName"] + ' ' + book['bookName']).strip() + " using searchtype " + searchtype) return False
def processResultList(resultlist, book, searchtype): myDB = database.DBConnection() dictrepl = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': ' ', '*': '', '(': '', ')': '', '[': '', ']': '', '#': '', '0': '', '1': '', '2': '', '3': '', '4': '', '5': '', '6': '', '7': '', '8': '', '9': '', '\'': '', ':': '', '!': '', '-': ' ', '\s\s': ' '} # ' the ': ' ', ' a ': ' ', ' and ': ' ', # ' to ': ' ', ' of ': ' ', ' for ': ' ', ' my ': ' ', ' in ': ' ', ' at ': ' ', ' with ': ' '} dic = {'...': '', '.': ' ', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': ''} match_ratio = int(lazylibrarian.MATCH_RATIO) reject_list = formatter.getList(lazylibrarian.REJECT_WORDS) for nzb in resultlist: nzbTitle = formatter.latinToAscii(formatter.replace_all(nzb['nzbtitle'], dictrepl)).strip() nzbTitle = re.sub(r"\s\s+", " ", nzbTitle) # remove extra whitespace author = formatter.latinToAscii(formatter.replace_all(book['authorName'], dic)) title = formatter.latinToAscii(formatter.replace_all(book['bookName'], dic)) # nzbTitle_match = fuzz.token_set_ratio(book['searchterm'], nzbTitle) # logger.debug(u"NZB Title sort Match %: " + str(nzbTitle_match) + " for " + nzbTitle) nzbAuthor_match = fuzz.token_set_ratio(author, nzbTitle) nzbBook_match = fuzz.token_set_ratio(title, nzbTitle) logger.debug(u"NZB author/book Match: %s/%s for %s" % (nzbAuthor_match, nzbBook_match, nzbTitle)) rejected = False for word in reject_list: if word in nzbTitle.lower() and not word in author.lower() and not word in title.lower(): rejected = True logger.debug("Rejecting %s, contains %s" % (nzbTitle, word)) break if (nzbAuthor_match >= match_ratio and nzbBook_match >= match_ratio and not rejected): logger.debug(u'Found NZB: %s using %s search' % (nzb['nzbtitle'], searchtype)) bookid = book['bookid'] nzbTitle = (author + ' - ' + title + ' LL.(' + book['bookid'] + ')').strip() nzburl = nzb['nzburl'] nzbprov = nzb['nzbprov'] nzbdate_temp = nzb['nzbdate'] nzbsize_temp = nzb['nzbsize'] # Need to cater for when this is NONE (Issue 35) if nzbsize_temp is None: nzbsize_temp = 1000 nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + ' MB' nzbdate = formatter.nzbdate2format(nzbdate_temp) nzbmode = nzb['nzbmode'] controlValueDict = {"NZBurl": nzburl} newValueDict = { "NZBprov": nzbprov, "BookID": bookid, "NZBdate": formatter.now(), # when we asked for it "NZBsize": nzbsize, "NZBtitle": nzbTitle, "NZBmode": nzbmode, "Status": "Skipped" } myDB.upsert("wanted", newValueDict, controlValueDict) snatchedbooks = myDB.action('SELECT * from books WHERE BookID="%s" and Status="Snatched"' % bookid).fetchone() if not snatchedbooks: if nzbmode == "torznab": snatch = TORDownloadMethod(bookid, nzbprov, nzbTitle, nzburl) else: snatch = NZBDownloadMethod(bookid, nzbprov, nzbTitle, nzburl) if snatch: notifiers.notify_snatch(nzbTitle + ' at ' + formatter.now()) common.schedule_job(action='Start', target='processDir') return True logger.debug("No nzb's found for " + (book["authorName"] + ' ' + book['bookName']).strip() + " using searchtype " + searchtype) return False
def getSeriesAuthors(seriesid): """ Get a list of authors contributing to a series and import those authors (and their books) into the database Return how many authors you added """ myDB = database.DBConnection() result = myDB.match("select count('AuthorID') as counter from authors") start = int(result['counter']) result = myDB.match('select SeriesName from series where SeriesID="%s"' % seriesid) seriesname = result['SeriesName'] members = getSeriesMembers(seriesid) if members: myDB = database.DBConnection() for member in members: #order = member[0] bookname = member[1] authorname = member[2] base_url = 'http://www.goodreads.com/search.xml?q=' params = {"key": lazylibrarian.CONFIG['GR_API']} searchname = bookname + ' ' + authorname searchname = cleanName(unaccented(searchname)) searchterm = urllib.quote_plus( searchname.encode(lazylibrarian.SYS_ENCODING)) set_url = base_url + searchterm + '&' + urllib.urlencode(params) authorid = '' try: rootxml, in_cache = get_xml_request(set_url) if len(rootxml): resultxml = rootxml.getiterator('work') for item in resultxml: booktitle = item.find('./best_book/title').text book_fuzz = fuzz.token_set_ratio(booktitle, bookname) if book_fuzz >= 98: author = item.find('./best_book/author/name').text authorid = item.find('./best_book/author/id').text logger.debug( "Author Search found %s %s, authorid %s" % (author, booktitle, authorid)) break if not authorid: # try again with title only searchname = cleanName(unaccented(bookname)) searchterm = urllib.quote_plus( searchname.encode(lazylibrarian.SYS_ENCODING)) set_url = base_url + searchterm + '&' + urllib.urlencode( params) rootxml, in_cache = get_xml_request(set_url) if len(rootxml): resultxml = rootxml.getiterator('work') for item in resultxml: booktitle = item.find('./best_book/title').text book_fuzz = fuzz.token_set_ratio( booktitle, bookname) if book_fuzz >= 98: author = item.find( './best_book/author/name').text authorid = item.find( './best_book/author/id').text logger.debug( "Title Search found %s %s, authorid %s" % (author, booktitle, authorid)) break if not authorid: logger.warn("GoodReads doesn't know about %s %s" % (authorname, bookname)) except Exception as e: logger.error("Error finding goodreads results: %s" % str(e)) if authorid: lazylibrarian.importer.addAuthorToDB(refresh=False, authorid=authorid) result = myDB.match("select count('AuthorID') as counter from authors") finish = int(result['counter']) newauth = finish - start logger.info("Added %s new author%s for %s" % (newauth, plural(newauth), seriesname)) return newauth