def OLDUsenetCrawler(book=None):
    """Send a book search request to the configured UsenetCrawler host.

    ``book`` must be a dict providing the ``searchterm``, ``authorName`` and
    ``bookName`` keys; the default of ``None`` is not usable because those
    keys are read unconditionally.

    The request URL is ``<host>/api?`` followed by the urlencoded API key,
    ``t=book``, title and author.  ``http://`` is prepended when the
    configured host does not already start with ``http``.

    NOTE(review): the parsed XML response is only stored in the local
    ``data`` and ``results`` is never filled; the code visible here returns
    nothing.  Confirm whether the result handling is missing.
    """
    HOST = lazylibrarian.USENETCRAWLER_HOST
    results = []

    # Diagnostic dump of the keys supplied by the caller.
    logger.debug('UsenetCrawler: book keys %s' % (list(book.keys()),))

    logger.info(
        'UsenetCrawler: Searching term [%s] for author [%s] and title [%s]'
        % (book['searchterm'], book['authorName'], book['bookName']))

    params = {
        "apikey": lazylibrarian.USENETCRAWLER_API,
        "t": "book",
        "title": book['bookName'],
        "author": book['authorName']
    }

    # sample request
    # http://www.usenet-crawler.com/api?apikey=7xxxxxxxxxxxxxyyyyyyyyyyyyyyzzz4&t=book&author=Daniel

    logger.debug("%s" % params)

    if not str(HOST).startswith("http"):
        HOST = 'http://' + HOST

    URL = HOST + '/api?' + urllib.urlencode(params)

    logger.debug('UsenetCrawler: searching on [%s] ' % URL)

    try:
        data = ElementTree.parse(urllib2.urlopen(URL, timeout=30))
    except (urllib2.URLError, IOError, EOFError) as e:
        # Lower-case ``error`` matches the other logger calls in this file.
        logger.error('Error fetching data from %s: %s' % (HOST, e))
        data = None
def get_author_books(self, authorid=None, authorname=None, refresh=False):
    """Import the Google Books volumes of one author into the ``books`` table.

    The author row is flagged ``Loading`` while the API is paged through
    (the start index advances by 40 per request) and is set to ``Active``
    with refreshed totals and last-book details once paging ends.

    Args:
        authorid: value of the ``AuthorID`` column identifying the author.
        authorname: author name used for the ``inauthor:"..."`` query, the
            stored ``AuthorName`` and the log prefixes.
        refresh: only selects which completion message is logged.

    Returns:
        ``books_dict``, a list this method never populates (always empty).
    """
    books_dict = []
    set_url = self.url + urllib.quote('inauthor:' + '"' + authorname + '"')
    api_hits = 0
    logger.info('[%s] Now processing books with Google Books API' % authorname)

    # Mark the author as loading while the import runs.
    myDB = database.DBConnection()
    controlValueDict = {"AuthorID": authorid}
    newValueDict = {"Status": "Loading"}
    myDB.upsert("authors", newValueDict, controlValueDict)

    # The preferred-language list does not change while paging.
    valid_langs = [
        valid_lang.strip()
        for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')
    ]

    startindex = 0
    resultcount = 0
    removedResults = 0
    ignored = 0
    added_count = 0
    updated_count = 0
    book_ignore_count = 0
    total_count = 0

    try:
        while True:
            self.params['startIndex'] = startindex
            URL = set_url + '&' + urllib.urlencode(self.params)

            try:
                jsonresults = json.JSONDecoder().decode(
                    urllib2.urlopen(URL, timeout=30).read())
            except HTTPError as err:
                logger.error(
                    'Google API returned HTTP Error - probably time/rate limiting - [%s]'
                    % err.msg)
                # No usable response for this page: stop paging instead of
                # processing an undefined or stale result set.
                break

            api_hits = api_hits + 1
            number_results = jsonresults['totalItems']
            logger.debug('[%s] Searching url: %s' % (authorname, URL))
            if number_results == 0:
                logger.info('Found no results for %s with value: %s'
                            % ('inauthor', authorname))
                break

            startindex = startindex + 40

            items = jsonresults.get('items')
            if not items:
                # A page without items means there is nothing left to read.
                break

            for item in items:
                total_count = total_count + 1
                volume = item.get('volumeInfo', {})

                # skip if no author, no author is no book.
                if not volume.get('authors'):
                    logger.debug('Skipped a result without authorfield.')
                    continue

                # skip if language is missing or not in the preferred list
                booklang = volume.get('language')
                if booklang is None:
                    ignored = ignored + 1
                    logger.debug(
                        'Skipped a result where no language is found')
                    continue
                if booklang not in valid_langs:
                    logger.debug('Skipped a book with language %s' % booklang)
                    ignored = ignored + 1
                    continue

                # Fields every stored book needs; skip only this result when
                # one is absent rather than abandoning the whole import.
                try:
                    bookid = item['id']
                    bookname = volume['title']
                    booklink = volume['canonicalVolumeLink']
                except KeyError as err:
                    logger.debug(
                        'Skipped a result without required field %s' % err)
                    continue

                # Optional fields with their fallback values.
                bookpub = volume.get('publisher')
                booksub = volume.get('subtitle')
                bookdate = volume.get('publishedDate', '0000-00-00')
                bookimg = volume.get('imageLinks', {}).get(
                    'thumbnail', 'images/nocover.png')
                bookrate = float(volume.get('averageRating', 0))
                bookpages = volume.get('pageCount', 0)
                bookgenre = (volume.get('categories') or [None])[0]
                bookdesc = volume.get('description')

                # Only the first identifier is inspected, and only an
                # ISBN_10 entry is kept.
                first_identifier = (
                    volume.get('industryIdentifiers') or [{}])[0]
                if first_identifier.get('type') == 'ISBN_10':
                    bookisbn = first_identifier.get('identifier')
                else:
                    bookisbn = None

                # NOTE(review): the query is built by string interpolation
                # from an id supplied by the remote API; switch to a
                # parameterized query if DBConnection.select supports one.
                find_book_status = myDB.select(
                    "SELECT * FROM books WHERE BookID = '%s'" % bookid)
                if find_book_status:
                    for resulted in find_book_status:
                        book_status = resulted['Status']
                else:
                    book_status = "Skipped"

                # remove books with bad characters in title; re.match only
                # tests the first character of the title.
                if re.match(r'[^\w-]', bookname):
                    removedResults = removedResults + 1
                    continue

                if book_status == "Ignored":
                    book_ignore_count = book_ignore_count + 1
                    continue

                controlValueDict = {"BookID": bookid}
                newValueDict = {
                    "AuthorName": authorname,
                    "AuthorID": authorid,
                    "AuthorLink": "",
                    "BookName": bookname,
                    "BookSub": booksub,
                    "BookDesc": bookdesc,
                    "BookIsbn": bookisbn,
                    "BookPub": bookpub,
                    "BookGenre": bookgenre,
                    "BookImg": bookimg,
                    "BookLink": booklink,
                    "BookRate": bookrate,
                    "BookPages": bookpages,
                    "BookDate": bookdate,
                    "BookLang": booklang,
                    "Status": book_status,
                    "BookAdded": formatter.today()
                }
                resultcount = resultcount + 1

                myDB.upsert("books", newValueDict, controlValueDict)
                logger.debug(u"book found " + bookname + " " + bookdate)
                if not find_book_status:
                    logger.info("[%s] Added book: %s" % (authorname, bookname))
                    added_count = added_count + 1
                else:
                    updated_count = updated_count + 1
                    logger.info("[%s] Updated book: %s"
                                % (authorname, bookname))

            if startindex >= number_results:
                break
    except KeyError as err:
        # Best effort: an unexpected missing key ends paging, but the author
        # totals below are still refreshed.
        logger.debug('[%s] Stopped processing results, missing key %s'
                     % (authorname, err))

    logger.info(
        '[%s] The Google Books API was hit %s times to populate book list'
        % (authorname, str(api_hits)))

    lastbook = myDB.action(
        "SELECT BookName, BookLink, BookDate from books WHERE AuthorID='%s' AND Status != 'Ignored' order by BookDate DESC"
        % authorid).fetchone()
    unignoredbooks = myDB.select(
        "SELECT COUNT(BookName) as unignored FROM books WHERE AuthorID='%s' AND Status != 'Ignored'"
        % authorid)
    bookCount = myDB.select(
        "SELECT COUNT(BookName) as counter FROM books WHERE AuthorID='%s'"
        % authorid)

    # fetchone() yields None when the author has no unignored books.
    if lastbook is not None:
        last_name = lastbook['BookName']
        last_link = lastbook['BookLink']
        last_date = lastbook['BookDate']
    else:
        last_name = last_link = last_date = None

    controlValueDict = {"AuthorID": authorid}
    newValueDict = {
        "Status": "Active",
        "TotalBooks": bookCount[0]['counter'],
        "UnignoredBooks": unignoredbooks[0]['unignored'],
        "LastBook": last_name,
        "LastLink": last_link,
        "LastDate": last_date
    }
    myDB.upsert("authors", newValueDict, controlValueDict)

    logger.debug("Found %s total books for author" % total_count)
    logger.debug("Removed %s bad language results for author" % ignored)
    logger.debug("Removed %s bad character results for author"
                 % removedResults)
    logger.debug("Ignored %s books by author marked as Ignored"
                 % book_ignore_count)
    logger.debug("Imported/Updated %s books for author" % resultcount)

    if refresh:
        logger.info(
            "[%s] Book processing complete: Added %s books / Updated %s books"
            % (authorname, str(added_count), str(updated_count)))
    else:
        logger.info(
            "[%s] Book processing complete: Added %s books to the database"
            % (authorname, str(added_count)))
    return books_dict
def find_results(self):
    """Search Google Books for ``self.name`` and return the matching volumes.

    When ``self.type`` is ``'book'`` the query is ``intitle:"<name>"``; for
    any other value it is ``inauthor:"<name>"``.  Pages are requested with
    the start index advancing by 40 until a response carries no items or an
    HTTP error occurs.  Results without an author, without a language, or
    with a language outside ``lazylibrarian.IMP_PREFLANG`` are skipped.

    Returns:
        A list of dicts with the keys ``authorname``, ``bookid``,
        ``bookname``, ``booksub``, ``bookisbn``, ``bookpub``, ``bookdate``,
        ``booklang``, ``booklink``, ``bookrate``, ``bookimg``, ``bookpages``,
        ``bookgenre`` and ``bookdesc``.
    """
    resultlist = []
    if self.type == 'book':
        set_url = self.url + urllib.quote('intitle:' + '"' + self.name + '"')
    else:
        set_url = self.url + urllib.quote('inauthor:' + '"' + self.name + '"')

    logger.info('Searching url: ' + set_url)

    # Compare against the individual configured languages, not against the
    # raw comma-separated string (which would accept any substring).
    valid_langs = [
        valid_lang.strip()
        for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')
    ]

    startindex = 0
    resultcount = 0
    ignored = 0

    while True:
        self.params['startIndex'] = startindex
        URL = set_url + '&' + urllib.urlencode(self.params)

        try:
            jsonresults = json.JSONDecoder().decode(
                urllib2.urlopen(URL, timeout=30).read())
        except HTTPError as err:
            logger.error(
                'Google API returned HTTP Error - probably time/rate limiting - [%s]'
                % err.msg)
            # No usable response: stop paging and return what was collected.
            break

        startindex = startindex + 40

        items = jsonresults.get('items')
        if not items:
            # A page without items marks the end of the result set.
            break

        for item in items:
            volume = item.get('volumeInfo', {})

            # skip if no author, no author is no book.
            authors = volume.get('authors')
            if not authors:
                logger.debug('Skipped a result without authorfield.')
                continue
            authorname = authors[0]

            # skip if language is missing or not in the preferred list
            booklang = volume.get('language')
            if booklang is None:
                ignored = ignored + 1
                logger.debug('Skipped a result where no language is found')
                continue
            if booklang not in valid_langs:
                ignored = ignored + 1
                continue

            # Fields every result needs; skip only this result when one is
            # absent.
            try:
                bookid = item['id']
                bookname = volume['title']
                booklink = volume['canonicalVolumeLink']
            except KeyError as err:
                logger.debug(
                    'Skipped a result without required field %s' % err)
                continue

            # Only the first identifier is inspected, and only an ISBN_10
            # entry is kept; 0 stands for "no ISBN".
            first_identifier = (volume.get('industryIdentifiers') or [{}])[0]
            if first_identifier.get('type') == 'ISBN_10':
                bookisbn = first_identifier.get('identifier', 0)
            else:
                bookisbn = 0

            resultlist.append({
                'authorname': authorname,
                'bookid': bookid,
                'bookname': bookname,
                'booksub': volume.get('subtitle'),
                'bookisbn': bookisbn,
                'bookpub': volume.get('publisher'),
                'bookdate': volume.get('publishedDate', '0000/00/00'),
                'booklang': booklang,
                'booklink': booklink,
                'bookrate': float(volume.get('averageRating', 0)),
                'bookimg': volume.get('imageLinks', {}).get(
                    'thumbnail', 'images/nocover.png'),
                'bookpages': volume.get('pageCount', '0'),
                'bookgenre': (volume.get('categories') or [None])[0],
                'bookdesc': volume.get('description', 'Not available')
            })
            resultcount = resultcount + 1

    logger.info('Found %s results for %s with name: %s'
                % (resultcount, self.type, self.name))
    if ignored > 0:
        logger.info(
            'Skipped %s results because it is not a preferred language.'
            % ignored)

    return resultlist