Пример #1
0
def OLDUsenetCrawler(book=None):
    """Query the configured UsenetCrawler host for a book and parse the reply.

    Builds a ``t=book`` API request from the book's title and author, fetches
    it with a 30 second timeout and parses the response with ElementTree.

    Args:
        book: mapping providing the 'searchterm', 'authorName' and 'bookName'
            keys. Despite the ``None`` default a mapping is required; passing
            ``None`` raises AttributeError on the first access.

    Returns:
        None. The visible body parses the response into ``data`` and prepares
        an empty ``results`` list, but returns neither.
        NOTE(review): the function looks truncated here - confirm whether the
        parsed data was meant to be turned into ``results`` and returned.
    """
    HOST = lazylibrarian.USENETCRAWLER_HOST
    results = []

    # Was a bare ``print`` statement; routed through the logger like the rest
    # of the diagnostics in this function.
    logger.debug('UsenetCrawler: book keys %s' % book.keys())

    logger.info('UsenetCrawler: Searching term [%s] for author [%s] and title [%s]' % (book['searchterm'], book['authorName'], book['bookName']))

    params = {
        "apikey": lazylibrarian.USENETCRAWLER_API,
        "t": "book",
        "title": book['bookName'],
        "author": book['authorName']
        }

    #sample request
    #http://www.usenet-crawler.com/api?apikey=7xxxxxxxxxxxxxyyyyyyyyyyyyyyzzz4&t=book&author=Daniel

    logger.debug("%s" % params)

    # Accept a bare host name in the configuration by defaulting to http.
    if not str(HOST).startswith("http"):
        HOST = 'http://' + HOST

    URL = HOST + '/api?' + urllib.urlencode(params)

    logger.debug('UsenetCrawler: searching on [%s] ' % URL)

    data = None
    response = None
    try:
        response = urllib2.urlopen(URL, timeout=30)
        data = ElementTree.parse(response)
    except (urllib2.URLError, IOError, EOFError) as e:
        # ``logger.Error`` did not match the lowercase logger calls used
        # everywhere else, so the handler itself would have failed.
        logger.error('Error fetching data from %s: %s' % (HOST, e))
        data = None
    finally:
        # Release the connection whether or not parsing succeeded.
        if response is not None:
            response.close()
Пример #2
0
    def get_author_books(self, authorid=None, authorname=None, refresh=False):
        """Import an author's books from the Google Books API into the database.

        Requests ``inauthor:"<authorname>"`` results page by page (advancing
        ``startIndex`` by 40 per request), upserts every accepted volume into
        the ``books`` table and finally refreshes the author's totals and
        last-book columns in the ``authors`` table.

        A volume is skipped when it has no author, no language, a language
        outside ``lazylibrarian.IMP_PREFLANG``, a title whose first character
        is neither a word character nor '-', or an existing row whose status
        is "Ignored".

        Args:
            authorid: value matched against the ``AuthorID`` column.
            authorname: author name used in the query and in log messages.
            refresh: only selects which completion message is logged.

        Returns:
            list: ``books_dict``, which is always returned empty; all results
            are written to the database.
        """
        books_dict = []
        set_url = self.url + urllib.quote('inauthor:' + '"' + authorname + '"')

        api_hits = 0
        logger.info('[%s] Now processing books with Google Books API' %
                    authorname)

        # Flag the author as loading while the import runs.
        myDB = database.DBConnection()
        controlValueDict = {"AuthorID": authorid}
        newValueDict = {"Status": "Loading"}
        myDB.upsert("authors", newValueDict, controlValueDict)

        startindex = 0
        resultcount = 0
        removedResults = 0
        ignored = 0
        added_count = 0
        updated_count = 0
        book_ignore_count = 0
        total_count = 0

        # The preferred-language list is the same for every item; build it once.
        valid_langs = [
            valid_lang.strip()
            for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')
        ]

        try:
            while True:

                self.params['startIndex'] = startindex
                URL = set_url + '&' + urllib.urlencode(self.params)

                try:
                    jsonresults = json.JSONDecoder().decode(
                        urllib2.urlopen(URL, timeout=30).read())
                except HTTPError as err:
                    logger.error(
                        'Google API returned HTTP Error - probably time/rate limiting - [%s]'
                        % err.msg)
                    # Nothing was fetched for this page, so there is no fresh
                    # result set to walk; stop paging and finalise the author.
                    break

                api_hits = api_hits + 1
                number_results = jsonresults['totalItems']
                logger.debug('[%s] Searching url: %s' % (authorname, URL))
                if number_results == 0:
                    # The original referenced an undefined name here.
                    logger.info('Found no results for %s with value: %s' %
                                ('inauthor:', authorname))
                    break

                startindex = startindex + 40

                # A response without 'items' means there is nothing left.
                items = jsonresults.get('items')
                if not items:
                    break

                for item in items:

                    total_count = total_count + 1

                    volume = item.get('volumeInfo') or {}

                    # skip if no author, no author is no book.
                    if not volume.get('authors'):
                        logger.debug('Skipped a result without authorfield.')
                        continue

                    # skip if the language is missing or not preferred
                    if 'language' not in volume:
                        ignored = ignored + 1
                        logger.debug(
                            'Skipped a result where no language is found')
                        continue
                    booklang = volume['language']
                    if booklang not in valid_langs:
                        logger.debug('Skipped a book with language %s' %
                                     booklang)
                        ignored = ignored + 1
                        continue

                    # Optional fields fall back to the same defaults as before.
                    bookpub = volume.get('publisher')
                    booksub = volume.get('subtitle')
                    bookdate = volume.get('publishedDate', '0000-00-00')
                    bookimg = volume.get('imageLinks', {}).get(
                        'thumbnail', 'images/nocover.png')
                    bookrate = volume.get('averageRating', 0)
                    bookpages = volume.get('pageCount', 0)
                    bookdesc = volume.get('description')

                    # Guard against present-but-empty lists as well.
                    categories = volume.get('categories')
                    bookgenre = categories[0] if categories else None

                    # Only the first identifier is considered, and only when
                    # it is an ISBN-10.
                    identifiers = volume.get('industryIdentifiers')
                    bookisbn = None
                    if identifiers and identifiers[0].get('type') == 'ISBN_10':
                        bookisbn = identifiers[0].get('identifier')

                    bookid = item['id']
                    bookname = volume['title']
                    booklink = volume['canonicalVolumeLink']
                    bookrate = float(bookrate)

                    # NOTE(review): bookid comes from the API response and is
                    # interpolated into SQL; switch to a parameterised query
                    # if DBConnection.select supports bound arguments.
                    find_book_status = myDB.select(
                        "SELECT * FROM books WHERE BookID = '%s'" % bookid)
                    if find_book_status:
                        for resulted in find_book_status:
                            book_status = resulted['Status']
                    else:
                        book_status = "Skipped"

                    # remove books with bad characters in title
                    # NOTE(review): re.match anchors at the start, so only the
                    # first character of the title is actually tested.
                    if re.match(r'[^\w-]', bookname):
                        removedResults = removedResults + 1
                        continue

                    if book_status == "Ignored":
                        book_ignore_count = book_ignore_count + 1
                        continue

                    controlValueDict = {"BookID": bookid}
                    newValueDict = {
                        "AuthorName": authorname,
                        "AuthorID": authorid,
                        "AuthorLink": "",
                        "BookName": bookname,
                        "BookSub": booksub,
                        "BookDesc": bookdesc,
                        "BookIsbn": bookisbn,
                        "BookPub": bookpub,
                        "BookGenre": bookgenre,
                        "BookImg": bookimg,
                        "BookLink": booklink,
                        "BookRate": bookrate,
                        "BookPages": bookpages,
                        "BookDate": bookdate,
                        "BookLang": booklang,
                        "Status": book_status,
                        "BookAdded": formatter.today()
                    }
                    resultcount = resultcount + 1

                    myDB.upsert("books", newValueDict, controlValueDict)
                    logger.debug(u"book found " + bookname + " " + bookdate)
                    if not find_book_status:
                        logger.info("[%s] Added book: %s" %
                                    (authorname, bookname))
                        added_count = added_count + 1
                    else:
                        updated_count = updated_count + 1
                        logger.info("[%s] Updated book: %s" %
                                    (authorname, bookname))

                # Pagination check belongs to the paging loop: inside the item
                # loop it cut the last page short after the first stored book.
                if startindex >= number_results:
                    break

        except KeyError as err:
            # A response or item lacked a required key; keep what was imported
            # so far, but record why paging stopped instead of hiding it.
            logger.debug(
                '[%s] Stopped processing Google Books results, missing key: %s'
                % (authorname, err))

        logger.info(
            '[%s] The Google Books API was hit %s times to populate book list'
            % (authorname, str(api_hits)))

        # NOTE(review): authorid is interpolated into these statements; prefer
        # bound parameters if DBConnection supports them.
        lastbook = myDB.action(
            "SELECT BookName, BookLink, BookDate from books WHERE AuthorID='%s' AND Status != 'Ignored' order by BookDate DESC"
            % authorid).fetchone()
        unignoredbooks = myDB.select(
            "SELECT COUNT(BookName) as unignored FROM books WHERE AuthorID='%s' AND Status != 'Ignored'"
            % authorid)
        bookCount = myDB.select(
            "SELECT COUNT(BookName) as counter FROM books WHERE AuthorID='%s'"
            % authorid)

        # fetchone() yields None when the author has no unignored books; store
        # empty last-book values rather than failing and leaving the author
        # stuck in the "Loading" state.
        controlValueDict = {"AuthorID": authorid}
        newValueDict = {
            "Status": "Active",
            "TotalBooks": bookCount[0]['counter'],
            "UnignoredBooks": unignoredbooks[0]['unignored'],
            "LastBook": lastbook['BookName'] if lastbook else None,
            "LastLink": lastbook['BookLink'] if lastbook else None,
            "LastDate": lastbook['BookDate'] if lastbook else None
        }
        myDB.upsert("authors", newValueDict, controlValueDict)

        logger.debug("Found %s total books for author" % total_count)
        logger.debug("Removed %s bad language results for author" % ignored)
        logger.debug("Removed %s bad character results for author" %
                     removedResults)
        logger.debug("Ignored %s books by author marked as Ignored" %
                     book_ignore_count)
        logger.debug("Imported/Updated %s books for author" % resultcount)

        if refresh:
            logger.info(
                "[%s] Book processing complete: Added %s books / Updated %s books"
                % (authorname, str(added_count), str(updated_count)))
        else:
            logger.info(
                "[%s] Book processing complete: Added %s books to the database"
                % (authorname, str(added_count)))
        return books_dict
Пример #3
0
    def find_results(self):
        """Search the Google Books API and return the matching volumes.

        Searches by title (``intitle:``) when ``self.type`` is 'book' and by
        author (``inauthor:``) otherwise, using ``self.name`` as the quoted
        search value. Pages are requested by advancing ``startIndex`` by 40
        until a response carries no items or an HTTP error occurs.

        Volumes without an author, without a language, or with a language not
        listed in ``lazylibrarian.IMP_PREFLANG`` are skipped.

        Returns:
            list of dict: one entry per accepted volume with the keys
            'authorname', 'bookid', 'bookname', 'booksub', 'bookisbn',
            'bookpub', 'bookdate', 'booklang', 'booklink', 'bookrate',
            'bookimg', 'bookpages', 'bookgenre' and 'bookdesc'.
        """
        resultlist = []

        if self.type == 'book':
            search_field = 'intitle:'
        else:
            search_field = 'inauthor:'
        set_url = self.url + urllib.quote(search_field + '"' + self.name + '"')

        logger.info('Searching url: ' + set_url)

        startindex = 0
        resultcount = 0
        ignored = 0

        # Compare against the individual configured codes, as
        # get_author_books does, instead of substring-matching the raw
        # comma-separated setting.
        valid_langs = [
            valid_lang.strip()
            for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')
        ]

        try:
            while True:

                self.params['startIndex'] = startindex
                URL = set_url + '&' + urllib.urlencode(self.params)

                try:
                    jsonresults = json.JSONDecoder().decode(
                        urllib2.urlopen(URL, timeout=30).read())
                except HTTPError as err:
                    logger.error(
                        'Google API returned HTTP Error - probably time/rate limiting - [%s]'
                        % err.msg)
                    # No fresh page to process; return what was collected.
                    break

                startindex = startindex + 40

                # A response without 'items' marks the end of the results.
                items = jsonresults.get('items')
                if not items:
                    break

                for item in items:

                    volume = item.get('volumeInfo') or {}

                    # skip if no author, no author is no book.
                    # (``continue`` rather than ``break``: one bad result must
                    # not discard the rest of the page.)
                    authors = volume.get('authors')
                    if not authors:
                        logger.debug('Skipped a result without authorfield.')
                        continue
                    authorname = authors[0]

                    # skip if the language is missing or not preferred
                    if 'language' not in volume:
                        ignored = ignored + 1
                        logger.debug(
                            'Skipped a result where no language is found')
                        continue
                    booklang = volume['language']
                    if booklang not in valid_langs:
                        ignored = ignored + 1
                        continue

                    # Optional fields fall back to the same defaults as before.
                    bookpub = volume.get('publisher')
                    booksub = volume.get('subtitle')
                    bookdate = volume.get('publishedDate', '0000/00/00')
                    bookimg = volume.get('imageLinks', {}).get(
                        'thumbnail', 'images/nocover.png')
                    bookrate = volume.get('averageRating', 0)
                    bookpages = volume.get('pageCount', '0')
                    bookdesc = volume.get('description', 'Not available')

                    # Guard against present-but-empty lists as well.
                    categories = volume.get('categories')
                    bookgenre = categories[0] if categories else None

                    # Only the first identifier is considered, and only when
                    # it is an ISBN-10.
                    identifiers = volume.get('industryIdentifiers')
                    bookisbn = 0
                    if identifiers and identifiers[0].get('type') == 'ISBN_10':
                        bookisbn = identifiers[0].get('identifier', 0)

                    resultlist.append({
                        'authorname': authorname,
                        'bookid': item['id'],
                        'bookname': volume['title'],
                        'booksub': booksub,
                        'bookisbn': bookisbn,
                        'bookpub': bookpub,
                        'bookdate': bookdate,
                        'booklang': booklang,
                        'booklink': volume['canonicalVolumeLink'],
                        'bookrate': float(bookrate),
                        'bookimg': bookimg,
                        'bookpages': bookpages,
                        'bookgenre': bookgenre,
                        'bookdesc': bookdesc
                    })

                    resultcount = resultcount + 1

        except KeyError as err:
            # An item lacked a required key (id, title or link); keep the
            # results gathered so far but record why the search stopped.
            logger.debug('Stopped processing Google Books results, '
                         'missing key: %s' % err)

        # Summary is logged on every exit path, not only after a KeyError.
        logger.info('Found %s results for %s with name: %s' %
                    (resultcount, self.type, self.name))
        if ignored > 0:
            logger.info(
                'Skipped %s results because it is not a preferred language.'
                % ignored)

        return resultlist