Exemplos de ratio em Python, exemplos de lib.fuzzywuzzy.fuzz.ratio em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: librarysync.py Projeto: MaDTaZ/LazyLibrarian

def find_book_in_db(myDB, author, book):
    # prefer an exact match on author & book
    match = myDB.action("SELECT BookID FROM books where AuthorName=? and BookName=?", [author, book]).fetchone()
    if match:
        logger.debug("Exact match [%s]" % book)
        return match["BookID"]
    else:
        # No exact match
        # Try a more complex fuzzy match against each book in the db by this author
        # Using hard-coded ratios for now, ratio high (>90), partial_ratio lower (>65)
        # These are results that work well on my library, minimal false matches and no misses on books that should be matched
        # Maybe make ratios configurable in config.ini later
        #
        books = myDB.select('SELECT BookID,BookName FROM books where AuthorName="%s"' % author)
        best_ratio = 0
        best_partial = 0
        ratio_name = ""
        partial_name = ""
        ratio_id = 0
        partial_id = 0
        logger.debug("Found %s books for %s" % (len(books), author))
        for a_book in books:
            # lowercase everything to raise fuzziness scores
            book_lower = book.lower()
            a_book_lower = a_book["BookName"].lower()
            #
            ratio = fuzz.ratio(book_lower, a_book_lower)
            partial = fuzz.partial_ratio(book_lower, a_book_lower)
            if ratio > best_ratio:
                best_ratio = ratio
                ratio_name = a_book["BookName"]
                ratio_id = a_book["BookID"]
            if partial > best_partial:
                best_partial = partial
                partial_name = a_book["BookName"]
                partial_id = a_book["BookID"]

            else:
                if partial == best_partial:
                    # prefer the match closest to the left, ie prefer starting with a match and ignoring the rest
                    # this eliminates most false matches against omnibuses
                    if a_book_lower.find(book_lower) < partial_name.lower().find(book_lower):
                        logger.debug("Fuzz left prefer [%s] over [%s]" % (a_book["BookName"], partial_name))
                        best_partial = partial
                        partial_name = a_book["BookName"]
                        partial_id = a_book["BookID"]
                        #
        if best_ratio > 90:
            logger.debug("Fuzz match   ratio [%d] [%s] [%s]" % (best_ratio, book, ratio_name))
            return ratio_id
        if best_partial > 65:
            logger.debug("Fuzz match partial [%d] [%s] [%s]" % (best_partial, book, partial_name))
            return partial_id

        logger.debug(
            "Fuzz failed [%s - %s] ratio [%d,%s], partial [%d,%s]"
            % (author, book, best_ratio, ratio_name, best_partial, partial_name)
        )
        return 0

Exemplo n.º 2

0

Exibir arquivo

    def find_results(self, searchterm=None, queue=None):
        """ GoogleBooks performs much better if we search for author OR title
            not both at once, so if searchterm is not isbn, two searches needed.
            Lazylibrarian searches use <ll> to separate title from author in searchterm
            If this token isn't present, it's an isbn or searchterm as supplied by user
        """
        try:
            myDB = database.DBConnection()
            resultlist = []
            # See if we should check ISBN field, otherwise ignore it
            api_strings = ['inauthor:', 'intitle:']
            if is_valid_isbn(searchterm):
                api_strings = ['isbn:']

            api_hits = 0

            ignored = 0
            total_count = 0
            no_author_count = 0

            if ' <ll> ' in searchterm:  # special token separates title from author
                title, authorname = searchterm.split(' <ll> ')
            else:
                title = ''
                authorname = ''

            fullterm = searchterm.replace(' <ll> ', '')
            logger.debug('Now searching Google Books API with searchterm: %s' %
                         fullterm)

            for api_value in api_strings:
                set_url = self.url
                if api_value == "isbn:":
                    set_url = set_url + urllib.quote(
                        api_value +
                        searchterm.encode(lazylibrarian.SYS_ENCODING))
                elif api_value == 'intitle:':
                    searchterm = fullterm
                    if title:  # just search for title
                        title = title.split(' (')[
                            0]  # with out any series info
                        searchterm = title
                    searchterm = searchterm.replace("'", "").replace(
                        '"', '')  # and no quotes
                    searchterm = searchterm.strip()
                    set_url = set_url + \
                              urllib.quote(api_value + '"' + searchterm.encode(lazylibrarian.SYS_ENCODING) + '"')
                elif api_value == 'inauthor:':
                    searchterm = fullterm
                    if authorname:
                        searchterm = authorname  # just search for author
                    set_url = set_url + \
                              urllib.quote(api_value + '"' + searchterm.encode(lazylibrarian.SYS_ENCODING) + '"')
                    searchterm = searchterm.strip()

                startindex = 0
                resultcount = 0
                ignored = 0
                number_results = 1
                total_count = 0
                no_author_count = 0
                try:
                    while startindex < number_results:

                        self.params['startIndex'] = startindex
                        URL = set_url + '&' + urllib.urlencode(self.params)

                        try:
                            jsonresults, in_cache = get_json_request(URL)
                            if not jsonresults:
                                number_results = 0
                            else:
                                if not in_cache:
                                    api_hits += 1
                                number_results = jsonresults['totalItems']
                                logger.debug('Searching url: ' + URL)
                            if number_results == 0:
                                logger.warn(
                                    'Found no results for %s with value: %s' %
                                    (api_value, searchterm))
                                break
                            else:
                                pass
                        except HTTPError as err:
                            logger.warn(
                                'Google Books API Error [%s]: Check your API key or wait a while'
                                % err.reason)
                            break

                        startindex += 40

                        for item in jsonresults['items']:

                            total_count += 1

                            # skip if no author, no author is no book.
                            try:
                                Author = item['volumeInfo']['authors'][0]
                            except KeyError:
                                logger.debug(
                                    'Skipped a result without authorfield.')
                                no_author_count += 1
                                continue
                            try:
                                bookname = item['volumeInfo']['title']
                            except KeyError:
                                logger.debug('Skipped a result without title.')
                                continue

                            valid_langs = getList(
                                lazylibrarian.CONFIG['IMP_PREFLANG'])
                            booklang = ''
                            if "All" not in valid_langs:  # don't care about languages, accept all
                                try:
                                    # skip if language is not in valid list -
                                    booklang = item['volumeInfo']['language']
                                    if booklang not in valid_langs:
                                        logger.debug(
                                            'Skipped %s with language %s' %
                                            (bookname, booklang))
                                        ignored += 1
                                        continue
                                except KeyError:
                                    ignored += 1
                                    logger.debug(
                                        'Skipped %s where no language is found'
                                        % bookname)
                                    continue

                            try:
                                bookpub = item['volumeInfo']['publisher']
                            except KeyError:
                                bookpub = ""

                            try:
                                booksub = item['volumeInfo']['subtitle']
                            except KeyError:
                                booksub = ""

                            try:
                                bookdate = item['volumeInfo']['publishedDate']
                            except KeyError:
                                bookdate = '0000-00-00'
                            bookdate = bookdate[:4]

                            try:
                                bookimg = item['volumeInfo']['imageLinks'][
                                    'thumbnail']
                            except KeyError:
                                bookimg = 'images/nocover.png'

                            try:
                                bookrate = item['volumeInfo']['averageRating']
                            except KeyError:
                                bookrate = 0

                            try:
                                bookpages = item['volumeInfo']['pageCount']
                            except KeyError:
                                bookpages = '0'

                            try:
                                bookgenre = item['volumeInfo']['categories'][0]
                            except KeyError:
                                bookgenre = ""

                            try:
                                bookdesc = item['volumeInfo']['description']
                            except KeyError:
                                bookdesc = 'Not available'

                            try:
                                num_reviews = item['volumeInfo'][
                                    'ratingsCount']
                            except KeyError:
                                num_reviews = 0

                            try:
                                if item['volumeInfo']['industryIdentifiers'][
                                        0]['type'] == 'ISBN_10':
                                    bookisbn = item['volumeInfo'][
                                        'industryIdentifiers'][0]['identifier']
                                else:
                                    bookisbn = 0
                            except KeyError:
                                bookisbn = 0

                            if authorname:
                                author_fuzz = fuzz.ratio(Author, authorname)
                            else:
                                author_fuzz = fuzz.ratio(Author, fullterm)

                            if title:
                                book_fuzz = fuzz.ratio(bookname, title)
                                # lose a point for each extra word in the fuzzy matches so we get the closest match
                                words = len(getList(bookname))
                                words -= len(getList(title))
                                book_fuzz -= abs(words)
                            else:
                                book_fuzz = fuzz.ratio(bookname, fullterm)

                            isbn_fuzz = 0
                            if is_valid_isbn(fullterm):
                                isbn_fuzz = 100

                            highest_fuzz = max((author_fuzz + book_fuzz) / 2,
                                               isbn_fuzz)

                            dic = {':': '.', '"': '', '\'': ''}
                            bookname = replace_all(bookname, dic)

                            bookname = unaccented(bookname)
                            bookname = bookname.strip()  # strip whitespace
                            bookid = item['id']

                            author = myDB.select(
                                'SELECT AuthorID FROM authors WHERE AuthorName = "%s"'
                                % Author.replace('"', '""'))
                            if author:
                                AuthorID = author[0]['authorid']
                            else:
                                AuthorID = ''

                            resultlist.append({
                                'authorname':
                                Author,
                                'authorid':
                                AuthorID,
                                'bookid':
                                bookid,
                                'bookname':
                                bookname,
                                'booksub':
                                booksub,
                                'bookisbn':
                                bookisbn,
                                'bookpub':
                                bookpub,
                                'bookdate':
                                bookdate,
                                'booklang':
                                booklang,
                                'booklink':
                                item['volumeInfo']['canonicalVolumeLink'],
                                'bookrate':
                                float(bookrate),
                                'bookimg':
                                bookimg,
                                'bookpages':
                                bookpages,
                                'bookgenre':
                                bookgenre,
                                'bookdesc':
                                bookdesc,
                                'author_fuzz':
                                author_fuzz,
                                'book_fuzz':
                                book_fuzz,
                                'isbn_fuzz':
                                isbn_fuzz,
                                'highest_fuzz':
                                highest_fuzz,
                                'num_reviews':
                                num_reviews
                            })

                            resultcount += 1

                except KeyError:
                    break

                logger.debug(
                    "Returning %s result%s for (%s) with keyword: %s" %
                    (resultcount, plural(resultcount), api_value, searchterm))

            logger.debug("Found %s result%s" %
                         (total_count, plural(total_count)))
            logger.debug("Removed %s unwanted language result%s" %
                         (ignored, plural(ignored)))
            logger.debug("Removed %s book%s with no author" %
                         (no_author_count, plural(no_author_count)))
            logger.debug(
                'The Google Books API was hit %s time%s for searchterm: %s' %
                (api_hits, plural(api_hits), fullterm))
            queue.put(resultlist)

        except Exception:
            logger.error('Unhandled exception in GB.find_results: %s' %
                         traceback.format_exc())

Exemplo n.º 3

0

Exibir arquivo

    def find_results(self, searchterm=None, queue=None):
        """ GoogleBooks performs much better if we search for author OR title
            not both at once, so if searchterm is not isbn, two searches needed.
            Lazylibrarian searches use <ll> to separate title from author in searchterm
            If this token isn't present, it's an isbn or searchterm as supplied by user
        """
        try:
            myDB = database.DBConnection()
            resultlist = []
            # See if we should check ISBN field, otherwise ignore it
            api_strings = ['inauthor:', 'intitle:']
            if is_valid_isbn(searchterm):
                api_strings = ['isbn:']

            api_hits = 0

            ignored = 0
            total_count = 0
            no_author_count = 0
            title = ''
            authorname = ''

            if ' <ll> ' in searchterm:  # special token separates title from author
                title, authorname = searchterm.split(' <ll> ')

            fullterm = searchterm.replace(' <ll> ', ' ')
            logger.debug('Now searching Google Books API with searchterm: %s' %
                         fullterm)

            for api_value in api_strings:
                set_url = self.url
                if api_value == "isbn:":
                    set_url = set_url + quote(api_value + searchterm)
                elif api_value == 'intitle:':
                    searchterm = fullterm
                    if title:  # just search for title
                        # noinspection PyUnresolvedReferences
                        title = title.split(' (')[0]  # without any series info
                        searchterm = title
                    searchterm = searchterm.replace("'", "").replace(
                        '"', '').strip()  # and no quotes
                    if PY2:
                        searchterm = searchterm.encode(
                            lazylibrarian.SYS_ENCODING)
                    set_url = set_url + quote(api_value + '"' + searchterm +
                                              '"')
                elif api_value == 'inauthor:':
                    searchterm = fullterm
                    if authorname:
                        searchterm = authorname  # just search for author
                    searchterm = searchterm.strip()
                    if PY2:
                        searchterm = searchterm.encode(
                            lazylibrarian.SYS_ENCODING)
                    set_url = set_url + quote_plus(api_value + '"' +
                                                   searchterm + '"')

                startindex = 0
                resultcount = 0
                ignored = 0
                number_results = 1
                total_count = 0
                no_author_count = 0
                try:
                    while startindex < number_results:

                        self.params['startIndex'] = startindex
                        URL = set_url + '&' + urlencode(self.params)

                        try:
                            jsonresults, in_cache = gb_json_request(URL)
                            if jsonresults is None:
                                number_results = 0
                            else:
                                if not in_cache:
                                    api_hits += 1
                                number_results = jsonresults['totalItems']
                                logger.debug('Searching url: ' + URL)
                            if number_results == 0:
                                logger.warn(
                                    'Found no results for %s with value: %s' %
                                    (api_value, searchterm))
                                break
                            else:
                                pass
                        except Exception as err:
                            if hasattr(err, 'reason'):
                                errmsg = err.reason
                            else:
                                errmsg = str(err)
                            logger.warn(
                                'Google Books API Error [%s]: Check your API key or wait a while'
                                % errmsg)
                            break

                        startindex += 40

                        for item in jsonresults['items']:
                            total_count += 1

                            book = bookdict(item)
                            if not book['author']:
                                logger.debug(
                                    'Skipped a result without authorfield.')
                                no_author_count += 1
                                continue

                            if not book['name']:
                                logger.debug('Skipped a result without title.')
                                continue

                            valid_langs = getList(
                                lazylibrarian.CONFIG['IMP_PREFLANG'])
                            if "All" not in valid_langs:  # don't care about languages, accept all
                                try:
                                    # skip if language is not in valid list -
                                    booklang = book['lang']
                                    if booklang not in valid_langs:
                                        logger.debug(
                                            'Skipped %s with language %s' %
                                            (book['name'], booklang))
                                        ignored += 1
                                        continue
                                except KeyError:
                                    ignored += 1
                                    logger.debug(
                                        'Skipped %s where no language is found'
                                        % book['name'])
                                    continue

                            if authorname:
                                author_fuzz = fuzz.ratio(
                                    book['author'], authorname)
                            else:
                                author_fuzz = fuzz.ratio(
                                    book['author'], fullterm)

                            if title:
                                book_fuzz = fuzz.token_set_ratio(
                                    book['name'], title)
                                # lose a point for each extra word in the fuzzy matches so we get the closest match
                                words = len(getList(book['name']))
                                words -= len(getList(title))
                                book_fuzz -= abs(words)
                            else:
                                book_fuzz = fuzz.token_set_ratio(
                                    book['name'], fullterm)

                            isbn_fuzz = 0
                            if is_valid_isbn(fullterm):
                                isbn_fuzz = 100

                            highest_fuzz = max((author_fuzz + book_fuzz) / 2,
                                               isbn_fuzz)

                            dic = {':': '.', '"': '', '\'': ''}
                            bookname = replace_all(book['name'], dic)

                            bookname = unaccented(bookname)
                            bookname = bookname.strip()  # strip whitespace

                            AuthorID = ''
                            if book['author']:
                                match = myDB.match(
                                    'SELECT AuthorID FROM authors WHERE AuthorName=?',
                                    (book['author'].replace('"', '""'), ))
                                if match:
                                    AuthorID = match['AuthorID']

                            resultlist.append({
                                'authorname':
                                book['author'],
                                'authorid':
                                AuthorID,
                                'bookid':
                                item['id'],
                                'bookname':
                                bookname,
                                'booksub':
                                book['sub'],
                                'bookisbn':
                                book['isbn'],
                                'bookpub':
                                book['pub'],
                                'bookdate':
                                book['date'],
                                'booklang':
                                book['lang'],
                                'booklink':
                                book['link'],
                                'bookrate':
                                float(book['rate']),
                                'bookrate_count':
                                book['rate_count'],
                                'bookimg':
                                book['img'],
                                'bookpages':
                                book['pages'],
                                'bookgenre':
                                book['genre'],
                                'bookdesc':
                                book['desc'],
                                'author_fuzz':
                                author_fuzz,
                                'book_fuzz':
                                book_fuzz,
                                'isbn_fuzz':
                                isbn_fuzz,
                                'highest_fuzz':
                                highest_fuzz,
                                'num_reviews':
                                book['ratings']
                            })

                            resultcount += 1

                except KeyError:
                    break

                logger.debug(
                    "Returning %s result%s for (%s) with keyword: %s" %
                    (resultcount, plural(resultcount), api_value, searchterm))

            logger.debug("Found %s result%s" %
                         (total_count, plural(total_count)))
            logger.debug("Removed %s unwanted language result%s" %
                         (ignored, plural(ignored)))
            logger.debug("Removed %s book%s with no author" %
                         (no_author_count, plural(no_author_count)))
            logger.debug(
                'The Google Books API was hit %s time%s for searchterm: %s' %
                (api_hits, plural(api_hits), fullterm))
            queue.put(resultlist)

        except Exception:
            logger.error('Unhandled exception in GB.find_results: %s' %
                         traceback.format_exc())

Exemplo n.º 4

0

Exibir arquivo

Arquivo: gb.py Projeto: Deltron216/LazyLibrarian

    def find_results(self, authorname=None, queue=None):
        threading.currentThread().name = "GB-SEARCH"
        resultlist = []
        #See if we should check ISBN field, otherwise ignore it
        try:
            isbn_check = int(authorname[:-1])
            if (len(str(isbn_check)) == 9) or (len(str(isbn_check)) == 12):
                api_strings = ['isbn:']
            else:
                api_strings = ['inauthor:', 'intitle:']
        except:
            api_strings = ['inauthor:', 'intitle:']

        api_hits = 0
        logger.info('Now searching Google Books API with keyword: ' + self.name)

        for api_value in api_strings:
            startindex = 0
            if api_value == "isbn:":
                set_url = self.url + urllib.quote(api_value + self.name)
            else:
                set_url = self.url + urllib.quote(api_value + '"' + self.name + '"')

            try:
                startindex = 0
                resultcount = 0
                removedResults = 0
                ignored = 0

                total_count = 0
                no_author_count = 0
                while True:

                    self.params['startIndex'] = startindex
                    URL = set_url + '&' + urllib.urlencode(self.params)

                    try:
                        jsonresults = json.JSONDecoder().decode(urllib2.urlopen(URL, timeout=30).read())
                        api_hits = api_hits + 1
                        number_results = jsonresults['totalItems']
                        logger.debug('Searching url: ' + URL)
                        if number_results == 0:
                            logger.info('Found no results for %s with value: %s' % (api_value, self.name))
                            break
                        else:
                            pass
                    except HTTPError, err:
                        logger.warn('Google Books API Error [%s]: Check your API key or wait a while' % err.msg)
                        break

                    startindex = startindex+40

                    for item in jsonresults['items']:

                        total_count = total_count + 1

                        # skip if no author, no author is no book.
                        try:
                            Author = item['volumeInfo']['authors'][0]
                        except KeyError:
                            logger.debug('Skipped a result without authorfield.')
                            no_author_count = no_author_count + 1
                            continue

                        try:
                            #skip if language is in ignore list
                            booklang = item['volumeInfo']['language']
                            valid_langs = ([valid_lang.strip() for valid_lang in lazylibrarian.IMP_PREFLANG.split(',')])
                            if booklang not in valid_langs:
                                logger.debug('Skipped a book with language %s' % booklang)
                                ignored = ignored + 1
                                continue
                        except KeyError:
                            ignored = ignored+1
                            logger.debug('Skipped a result where no language is found')
                            continue

                        try:
                            bookpub = item['volumeInfo']['publisher']
                        except KeyError:
                            bookpub = None

                        try:
                            booksub = item['volumeInfo']['subtitle']
                        except KeyError:
                            booksub = None

                        try:
                            bookdate = item['volumeInfo']['publishedDate']
                        except KeyError:
                            bookdate = '0000-00-00'
                        bookdate = bookdate[:4]

                        try:
                            bookimg = item['volumeInfo']['imageLinks']['thumbnail']
                        except KeyError:
                            bookimg = 'images/nocover.png'

                        try:
                            bookrate = item['volumeInfo']['averageRating']
                        except KeyError:
                            bookrate = 0

                        try:
                            bookpages = item['volumeInfo']['pageCount']
                        except KeyError:
                            bookpages = '0'

                        try:
                            bookgenre = item['volumeInfo']['categories'][0]
                        except KeyError:
                            bookgenre = None

                        try:
                            bookdesc = item['volumeInfo']['description']
                        except KeyError:
                            bookdesc = 'Not available'

                        try:
                            num_reviews = item['volumeInfo']['ratingsCount']
                        except KeyError:
                            num_reviews = 0

                        try:
                            if item['volumeInfo']['industryIdentifiers'][0]['type'] == 'ISBN_10':
                                bookisbn = item['volumeInfo']['industryIdentifiers'][0]['identifier']
                            else:
                                bookisbn = 0
                        except KeyError:
                            bookisbn = 0

                        author_fuzz = fuzz.ratio(Author.lower(), authorname.lower())
                        book_fuzz = fuzz.ratio(item['volumeInfo']['title'].lower(), authorname.lower())
                        try:
                            isbn_check = int(authorname[:-1])
                            if (len(str(isbn_check)) == 9) or (len(str(isbn_check)) == 12):
                                isbn_fuzz = int(100)
                            else:
                                isbn_fuzz = int(0)
                        except:
                            isbn_fuzz = int(0)
                        highest_fuzz = max(author_fuzz, book_fuzz, isbn_fuzz)

#  Darkie67:
#        replacing German Umlauts and filtering out ":"
#
                        booknamealt = item['volumeInfo']['title']
                        booknametmp1=booknamealt.replace(u'\xf6',u'oe')
                        booknametmp2=booknametmp1.replace(u'\xe4',u'ae')
                        booknametmp3=booknametmp2.replace(u'\xdf',u'ss')
                        booknametmp4=booknametmp3.replace(u'\xc4',u'Ae')
                        booknametmp5=booknametmp4.replace(u'\xdc',u'Ue')
                        booknametmp6=booknametmp5.replace(u'\xd6',u'Oe')
                        booknametmp7=booknametmp6.replace(':','')  
                        bookname=booknametmp7.replace(u'\xfc',u'ue')
# Darkie67 end                        
                        resultlist.append({
                            'authorname': Author,
                            'bookid': item['id'],
                            'bookname': bookname,
                            'booksub': booksub,
                            'bookisbn': bookisbn,
                            'bookpub': bookpub,
                            'bookdate': bookdate,
                            'booklang': booklang,
                            'booklink': item['volumeInfo']['canonicalVolumeLink'],
                            'bookrate': float(bookrate),
                            'bookimg': bookimg,
                            'bookpages': bookpages,
                            'bookgenre': bookgenre,
                            'bookdesc': bookdesc,
                            'author_fuzz': author_fuzz,
                            'book_fuzz': book_fuzz,
                            'isbn_fuzz': isbn_fuzz,
                            'highest_fuzz': highest_fuzz,
                            'num_reviews': num_reviews
                            })

                        resultcount = resultcount+1

                    if startindex >= number_results:
                        logger.debug("Found %s total results" % total_count)
                        logger.debug("Removed %s bad language results" % ignored)
                        logger.debug("Removed %s books with no author" % no_author_count)
                        logger.info("Showing %s results for (%s) with keyword: %s" % (resultcount, api_value, authorname))
                        break
                    else:
                        continue

            except KeyError:
                break

        logger.info('The Google Books API was hit %s times for keyword %s' % (str(api_hits), self.name))
        queue.put(resultlist)

Exemplo n.º 5

0

Exibir arquivo

Arquivo: librarysync.py Projeto: duckville/LazyLibrarian

def LibraryScan(dir=None):
    if not dir:
        if not lazylibrarian.DOWNLOAD_DIR:
            return
        else:
            dir = lazylibrarian.DOWNLOAD_DIR

    if not os.path.isdir(dir):
        logger.warn("Cannot find directory: %s. Not scanning" % dir.decode(lazylibrarian.SYS_ENCODING, "replace"))
        return

    myDB = database.DBConnection()

    myDB.action("drop table if exists stats")
    myDB.action(
        "create table stats (authorname text, GR_book_hits int, GR_lang_hits int, LT_lang_hits int, \
                            GB_lang_change, cache_hits int, bad_lang int, bad_char int, uncached int )"
    )

    logger.info("Scanning ebook directory: %s" % dir.decode(lazylibrarian.SYS_ENCODING, "replace"))

    new_book_count = 0
    file_count = 0

    if lazylibrarian.FULL_SCAN:
        books = myDB.select('select AuthorName, BookName, BookFile, BookID from books where Status="Open"')
        status = lazylibrarian.NOTFOUND_STATUS
        logger.info("Missing books will be marked as %s" % status)
        for book in books:
            bookName = book["BookName"]
            bookAuthor = book["AuthorName"]
            bookID = book["BookID"]
            bookfile = book["BookFile"]

            if not (bookfile and os.path.isfile(bookfile)):
                myDB.action('update books set Status="%s" where BookID="%s"' % (status, bookID))
                myDB.action('update books set BookFile="" where BookID="%s"' % bookID)
                logger.warn("Book %s - %s updated as not found on disk" % (bookAuthor, bookName))

    # guess this was meant to save repeat-scans of the same directory
    # if it contains multiple formats of the same book, but there was no code
    # that looked at the array. renamed from latest to processed to make
    # purpose clearer
    processed_subdirectories = []

    matchString = ""
    for char in lazylibrarian.EBOOK_DEST_FILE:
        matchString = matchString + "\\" + char
    # massage the EBOOK_DEST_FILE config parameter into something we can use
    # with regular expression matching
    booktypes = ""
    count = -1
    booktype_list = formatter.getList(lazylibrarian.EBOOK_TYPE)
    for book_type in booktype_list:
        count += 1
        if count == 0:
            booktypes = book_type
        else:
            booktypes = booktypes + "|" + book_type
    matchString = (
        matchString.replace("\\$\\A\\u\\t\\h\\o\\r", "(?P<author>.*?)").replace("\\$\\T\\i\\t\\l\\e", "(?P<book>.*?)")
        + "\.["
        + booktypes
        + "]"
    )
    pattern = re.compile(matchString, re.VERBOSE)

    for r, d, f in os.walk(dir):
        for directory in d[:]:
            if directory.startswith("."):
                d.remove(directory)
            # prevent magazine being scanned
            if directory.startswith("_"):
                d.remove(directory)
        for files in f:
            file_count += 1
            subdirectory = r.replace(dir, "")
            # Added new code to skip if we've done this directory before.
            # Made this conditional with a switch in config.ini
            # in case user keeps multiple different books in the same subdirectory
            if (lazylibrarian.IMP_SINGLEBOOK) and (subdirectory in processed_subdirectories):
                logger.debug("[%s] already scanned" % subdirectory)
            else:
                # If this is a book, try to get author/title/isbn/language
                # if epub or mobi, read metadata from the book
                # If metadata.opf exists, use that allowing it to override
                # embedded metadata. User may have edited metadata.opf
                # to merge author aliases together
                # If all else fails, try pattern match for author/title
                # and look up isbn/lang from LT or GR later
                match = 0

                if formatter.is_valid_booktype(files):
                    logger.debug(
                        "[%s] Now scanning subdirectory %s"
                        % (
                            dir.decode(lazylibrarian.SYS_ENCODING, "replace"),
                            subdirectory.decode(lazylibrarian.SYS_ENCODING, "replace"),
                        )
                    )
                    language = "Unknown"
                    isbn = ""
                    book = ""
                    author = ""
                    words = files.split(".")
                    extn = words[len(words) - 1]

                    # if it's an epub or a mobi we can try to read metadata from it
                    if (extn == "epub") or (extn == "mobi"):
                        book_filename = os.path.join(r, files).encode(lazylibrarian.SYS_ENCODING)
                        try:
                            res = get_book_info(book_filename)
                        except:
                            res = {}
                        if "title" in res and "creator" in res:  # this is the minimum we need
                            match = 1
                            book = res["title"]
                            author = res["creator"]
                            if "language" in res:
                                language = res["language"]
                            if "identifier" in res:
                                isbn = res["identifier"]
                            if "type" in res:
                                extn = res["type"]
                            logger.debug("book meta [%s] [%s] [%s] [%s] [%s]" % (isbn, language, author, book, extn))
                        else:
                            logger.debug("Book meta incomplete in %s" % book_filename)

                    # calibre uses "metadata.opf", LL uses "bookname - authorname.opf"
                    # just look for any .opf file in the current directory since we don't know
                    # LL preferred authorname/bookname at this point.
                    # Allow metadata in file to override book contents as may be users pref
                    metafile = opf_file(r)
                    try:
                        res = get_book_info(metafile)
                    except:
                        res = {}
                    if "title" in res and "creator" in res:  # this is the minimum we need
                        match = 1
                        book = res["title"]
                        author = res["creator"]
                        if "language" in res:
                            language = res["language"]
                        if "identifier" in res:
                            isbn = res["identifier"]
                        logger.debug("file meta [%s] [%s] [%s] [%s]" % (isbn, language, author, book))
                    else:
                        logger.debug("File meta incomplete in %s" % metafile)

                    if not match:  # no author/book from metadata file, and not embedded either
                        match = pattern.match(files)
                        if match:
                            author = match.group("author")
                            book = match.group("book")
                        else:
                            logger.debug("Pattern match failed [%s]" % files)

                    if match:
                        # flag that we found a book in this subdirectory
                        processed_subdirectories.append(subdirectory)

                        # If we have a valid looking isbn, and language != "Unknown", add it to cache
                        if language != "Unknown" and formatter.is_valid_isbn(isbn):
                            logger.debug("Found Language [%s] ISBN [%s]" % (language, isbn))
                            # we need to add it to language cache if not already
                            # there, is_valid_isbn has checked length is 10 or 13
                            if len(isbn) == 10:
                                isbnhead = isbn[0:3]
                            else:
                                isbnhead = isbn[3:6]
                            match = myDB.action('SELECT lang FROM languages where isbn = "%s"' % (isbnhead)).fetchone()
                            if not match:
                                myDB.action('insert into languages values ("%s", "%s")' % (isbnhead, language))
                                logger.debug("Cached Lang [%s] ISBN [%s]" % (language, isbnhead))
                            else:
                                logger.debug("Already cached Lang [%s] ISBN [%s]" % (language, isbnhead))

                        # get authors name in a consistent format
                        if "," in author:  # "surname, forename"
                            words = author.split(",")
                            author = words[1].strip() + " " + words[0].strip()  # "forename surname"
                        if author[1] == " ":
                            author = author.replace(" ", ".")
                            author = author.replace("..", ".")

                        # Check if the author exists, and import the author if not,
                        # before starting any complicated book-name matching to save repeating the search
                        #
                        check_exist_author = myDB.action(
                            'SELECT * FROM authors where AuthorName="%s"' % author
                        ).fetchone()
                        if not check_exist_author and lazylibrarian.ADD_AUTHOR:
                            # no match for supplied author, but we're allowed to
                            # add new ones

                            GR = GoodReads(author)
                            try:
                                author_gr = GR.find_author_id()
                            except:
                                logger.warn("Error finding author id for [%s]" % author)
                                continue

                            # only try to add if GR data matches found author data
                            if author_gr:
                                authorname = author_gr["authorname"]

                                # "J.R.R. Tolkien" is the same person as "J. R. R. Tolkien" and "J R R Tolkien"
                                match_auth = author.replace(".", "_")
                                match_auth = match_auth.replace(" ", "_")
                                match_auth = match_auth.replace("__", "_")
                                match_name = authorname.replace(".", "_")
                                match_name = match_name.replace(" ", "_")
                                match_name = match_name.replace("__", "_")
                                match_name = common.remove_accents(match_name)
                                match_auth = common.remove_accents(match_auth)
                                # allow a degree of fuzziness to cater for different accented character handling.
                                # some author names have accents,
                                # filename may have the accented or un-accented version of the character
                                # The currently non-configurable value of fuzziness might need to go in config
                                # We stored GoodReads unmodified author name in
                                # author_gr, so store in LL db under that
                                # fuzz.ratio doesn't lowercase for us
                                match_fuzz = fuzz.ratio(match_auth.lower(), match_name.lower())
                                if match_fuzz < 90:
                                    logger.debug("Failed to match author [%s] fuzz [%d]" % (author, match_fuzz))
                                    logger.debug(
                                        "Failed to match author [%s] to authorname [%s]" % (match_auth, match_name)
                                    )

                                # To save loading hundreds of books by unknown
                                # authors at GR or GB, ignore if author "Unknown"
                                if (author != "Unknown") and (match_fuzz >= 90):
                                    # use "intact" name for author that we stored in
                                    # GR author_dict, not one of the various mangled versions
                                    # otherwise the books appear to be by a different author!
                                    author = author_gr["authorname"]
                                    # this new authorname may already be in the
                                    # database, so check again
                                    check_exist_author = myDB.action(
                                        'SELECT * FROM authors where AuthorName="%s"' % author
                                    ).fetchone()
                                    if not check_exist_author:
                                        logger.debug("Adding new author [%s]" % author)
                                        try:
                                            importer.addAuthorToDB(author)
                                            check_exist_author = myDB.action(
                                                'SELECT * FROM authors where AuthorName="%s"' % author
                                            ).fetchone()
                                        except:
                                            continue

                        # check author exists in db, either newly loaded or already there
                        if not check_exist_author:
                            logger.debug("Failed to match author [%s] in database" % author)
                        else:
                            # author exists, check if this book by this author is in our database
                            # metadata might have quotes in book name
                            book = book.replace('"', "").replace("'", "")
                            bookid = find_book_in_db(myDB, author, book)
                            if bookid:
                                # check if book is already marked as "Open" (if so,
                                # we already had it)
                                check_status = myDB.action(
                                    'SELECT Status from books where BookID="%s"' % bookid
                                ).fetchone()
                                if check_status["Status"] != "Open":
                                    # update status as we've got this book
                                    myDB.action('UPDATE books set Status="Open" where BookID="%s"' % bookid)
                                    book_filename = os.path.join(r, files).encode(lazylibrarian.SYS_ENCODING)
                                    # update book location so we can check if it
                                    # gets removed, or allow click-to-open
                                    myDB.action(
                                        'UPDATE books set BookFile="%s" where BookID="%s"' % (book_filename, bookid)
                                    )
                                    new_book_count += 1

    cachesize = myDB.action("select count('ISBN') as counter from languages").fetchone()
    logger.info("%s new/modified books found and added to the database" % new_book_count)
    logger.info("%s files processed" % file_count)
    stats = myDB.action(
        "SELECT sum(GR_book_hits), sum(GR_lang_hits), sum(LT_lang_hits), sum(GB_lang_change), \
            sum(cache_hits), sum(bad_lang), sum(bad_char), sum(uncached) FROM stats"
    ).fetchone()
    if stats["sum(GR_book_hits)"] is not None:
        # only show stats if new books added
        if lazylibrarian.BOOK_API == "GoogleBooks":
            logger.debug("GoogleBooks was hit %s times for books" % stats["sum(GR_book_hits)"])
            logger.debug("GoogleBooks language was changed %s times" % stats["sum(GB_lang_change)"])
        if lazylibrarian.BOOK_API == "GoodReads":
            logger.debug("GoodReads was hit %s times for books" % stats["sum(GR_book_hits)"])
            logger.debug("GoodReads was hit %s times for languages" % stats["sum(GR_lang_hits)"])
        logger.debug("LibraryThing was hit %s times for languages" % stats["sum(LT_lang_hits)"])
        logger.debug("Language cache was hit %s times" % stats["sum(cache_hits)"])
        logger.debug("Unwanted language removed %s books" % stats["sum(bad_lang)"])
        logger.debug("Unwanted characters removed %s books" % stats["sum(bad_char)"])
        logger.debug("Unable to cache %s books with missing ISBN" % stats["sum(uncached)"])
    logger.debug("Cache %s hits, %s miss" % (lazylibrarian.CACHE_HIT, lazylibrarian.CACHE_MISS))
    logger.debug("ISBN Language cache holds %s entries" % cachesize["counter"])
    stats = len(myDB.select('select BookID from Books where status="Open" and BookLang="Unknown"'))
    if stats:
        logger.warn("There are %s books in your library with unknown language" % stats)

    authors = myDB.select("select AuthorName from authors")
    # Update bookcounts for all authors, not just new ones - refresh may have located
    # new books for existing authors especially if switched provider gb/gr
    logger.debug("Updating bookcounts for %i authors" % len(authors))
    for author in authors:
        name = author["AuthorName"]
        havebooks = myDB.action(
            'SELECT count("BookID") as counter from books WHERE AuthorName="%s" AND (Status="Have" OR Status="Open")'
            % name
        ).fetchone()
        myDB.action('UPDATE authors set HaveBooks="%s" where AuthorName="%s"' % (havebooks["counter"], name))
        totalbooks = myDB.action(
            'SELECT count("BookID") as counter FROM books WHERE AuthorName="%s" AND Status!="Ignored"' % name
        ).fetchone()
        myDB.action('UPDATE authors set UnignoredBooks="%s" where AuthorName="%s"' % (totalbooks["counter"], name))

    logger.info("Library scan complete")

Exemplo n.º 6

0

Exibir arquivo

Arquivo: librarysync.py Projeto: Finch106/LazyLibrarian

def find_book_in_db(myDB, author, book):
    # PAB fuzzy search for book in library, return LL bookid if found or zero
    # if not, return bookid to more easily update status
    # prefer an exact match on author & book
    match = myDB.match(
        'SELECT BookID FROM books where AuthorName="%s" and BookName="%s"' %
        (author.replace('"', '""'), book.replace('"', '""')))
    if match:
        logger.debug('Exact match [%s]' % book)
        return match['BookID']
    else:
        # No exact match
        # Try a more complex fuzzy match against each book in the db by this author
        # Using hard-coded ratios for now, ratio high (>90), partial_ratio lower (>75)
        # These are results that work well on my library, minimal false matches and no misses
        # on books that should be matched
        # Maybe make ratios configurable in config.ini later

        books = myDB.select(
            'SELECT BookID,BookName FROM books where AuthorName="%s"' %
            author.replace('"', '""'))

        best_ratio = 0
        best_partial = 0
        best_partname = 0
        ratio_name = ""
        partial_name = ""
        partname_name = ""
        ratio_id = 0
        partial_id = 0
        partname_id = 0
        partname = 0

        book_lower = unaccented(book.lower())
        book_partname = ''
        if ':' in book_lower:
            book_partname = book_lower.split(':')[0]

        for a_book in books:
            # tidy up everything to raise fuzziness scores
            # still need to lowercase for matching against partial_name later on
            a_book_lower = unaccented(a_book['BookName'].lower())
            #
            ratio = fuzz.ratio(book_lower, a_book_lower)
            partial = fuzz.partial_ratio(book_lower, a_book_lower)
            if book_partname:
                partname = fuzz.partial_ratio(book_partname, a_book_lower)

            # lose a point for each extra word in the fuzzy matches so we get the closest match
            words = len(getList(book_lower))
            words -= len(getList(a_book_lower))
            ratio -= abs(words)
            partial -= abs(words)

            if ratio > best_ratio:
                best_ratio = ratio
                ratio_name = a_book['BookName']
                ratio_id = a_book['BookID']
            if partial > best_partial:
                best_partial = partial
                partial_name = a_book['BookName']
                partial_id = a_book['BookID']
            if partname > best_partname:
                best_partname = partname
                partname_name = a_book['BookName']
                partname_id = a_book['BookID']

            if partial == best_partial:
                # prefer the match closest to the left, ie prefer starting with a match and ignoring the rest
                # this eliminates most false matches against omnibuses
                # find the position of the shortest string in the longest
                if len(getList(book_lower)) >= len(getList(a_book_lower)):
                    match1 = book_lower.find(a_book_lower)
                else:
                    match1 = a_book_lower.find(book_lower)

                if len(getList(book_lower)) >= len(
                        getList(partial_name.lower())):
                    match2 = book_lower.find(partial_name.lower())
                else:
                    match2 = partial_name.lower().find(book_lower)

                if match1 < match2:
                    logger.debug(
                        "Fuzz left change, prefer [%s] over [%s] for [%s]" %
                        (a_book['BookName'], partial_name, book))
                    best_partial = partial
                    partial_name = a_book['BookName']
                    partial_id = a_book['BookID']

        if best_ratio > 90:
            logger.debug("Fuzz match   ratio [%d] [%s] [%s]" %
                         (best_ratio, book, ratio_name))
            return ratio_id
        if best_partial > 75:
            logger.debug("Fuzz match partial [%d] [%s] [%s]" %
                         (best_partial, book, partial_name))
            return partial_id
        if best_partname > 95:
            logger.debug("Fuzz match partname [%d] [%s] [%s]" %
                         (best_partname, book, partname_name))
            return partname_id

        logger.debug(
            'Fuzz failed [%s - %s] ratio [%d,%s], partial [%d,%s], partname [%d,%s]'
            % (author, book, best_ratio, ratio_name, best_partial,
               partial_name, best_partname, partname_name))
        return 0

Exemplo n.º 7

0

Exibir arquivo

    def find_results(self, authorname=None, queue=None):
        threading.currentThread().name = "GB-SEARCH"
        resultlist = []
        #See if we should check ISBN field, otherwise ignore it
        try:
            isbn_check = int(authorname[:-1])
            if (len(str(isbn_check)) == 9) or (len(str(isbn_check)) == 12):
                api_strings = ['isbn:']
            else:
                api_strings = ['inauthor:', 'intitle:']
        except:
            api_strings = ['inauthor:', 'intitle:']

        api_hits = 0
        logger.info('Now searching Google Books API with keyword: ' +
                    self.name)

        for api_value in api_strings:
            startindex = 0
            if api_value == "isbn:":
                set_url = self.url + urllib.quote(api_value + self.name)
            else:
                set_url = self.url + urllib.quote(api_value + '"' + self.name +
                                                  '"')

            try:
                startindex = 0
                resultcount = 0
                removedResults = 0
                ignored = 0

                total_count = 0
                no_author_count = 0
                while True:

                    self.params['startIndex'] = startindex
                    URL = set_url + '&' + urllib.urlencode(self.params)

                    try:
                        jsonresults = json.JSONDecoder().decode(
                            urllib2.urlopen(URL, timeout=30).read())
                        api_hits = api_hits + 1
                        number_results = jsonresults['totalItems']
                        logger.debug('Searching url: ' + URL)
                        if number_results == 0:
                            logger.info(
                                'Found no results for %s with value: %s' %
                                (api_value, self.name))
                            break
                        else:
                            pass
                    except HTTPError, err:
                        logger.warn(
                            'Google Books API Error [%s]: Check your API key or wait a while'
                            % err.msg)
                        break

                    startindex = startindex + 40

                    for item in jsonresults['items']:

                        total_count = total_count + 1

                        # skip if no author, no author is no book.
                        try:
                            Author = item['volumeInfo']['authors'][0]
                        except KeyError:
                            logger.debug(
                                'Skipped a result without authorfield.')
                            no_author_count = no_author_count + 1
                            continue

                        try:
                            #skip if language is in ignore list
                            booklang = item['volumeInfo']['language']
                            valid_langs = ([
                                valid_lang.strip() for valid_lang in
                                lazylibrarian.IMP_PREFLANG.split(',')
                            ])
                            if booklang not in valid_langs:
                                logger.debug(
                                    'Skipped a book with language %s' %
                                    booklang)
                                ignored = ignored + 1
                                continue
                        except KeyError:
                            ignored = ignored + 1
                            logger.debug(
                                'Skipped a result where no language is found')
                            continue

                        try:
                            bookpub = item['volumeInfo']['publisher']
                        except KeyError:
                            bookpub = None

                        try:
                            booksub = item['volumeInfo']['subtitle']
                        except KeyError:
                            booksub = None

                        try:
                            bookdate = item['volumeInfo']['publishedDate']
                        except KeyError:
                            bookdate = '0000-00-00'
                        bookdate = bookdate[:4]

                        try:
                            bookimg = item['volumeInfo']['imageLinks'][
                                'thumbnail']
                        except KeyError:
                            bookimg = 'images/nocover.png'

                        try:
                            bookrate = item['volumeInfo']['averageRating']
                        except KeyError:
                            bookrate = 0

                        try:
                            bookpages = item['volumeInfo']['pageCount']
                        except KeyError:
                            bookpages = '0'

                        try:
                            bookgenre = item['volumeInfo']['categories'][0]
                        except KeyError:
                            bookgenre = None

                        try:
                            bookdesc = item['volumeInfo']['description']
                        except KeyError:
                            bookdesc = 'Not available'

                        try:
                            num_reviews = item['volumeInfo']['ratingsCount']
                        except KeyError:
                            num_reviews = 0

                        try:
                            if item['volumeInfo']['industryIdentifiers'][0][
                                    'type'] == 'ISBN_10':
                                bookisbn = item['volumeInfo'][
                                    'industryIdentifiers'][0]['identifier']
                            else:
                                bookisbn = 0
                        except KeyError:
                            bookisbn = 0

                        author_fuzz = fuzz.ratio(Author.lower(),
                                                 authorname.lower())
                        book_fuzz = fuzz.ratio(
                            item['volumeInfo']['title'].lower(),
                            authorname.lower())
                        try:
                            isbn_check = int(authorname[:-1])
                            if (len(str(isbn_check)) == 9) or (len(
                                    str(isbn_check)) == 12):
                                isbn_fuzz = int(100)
                            else:
                                isbn_fuzz = int(0)
                        except:
                            isbn_fuzz = int(0)
                        highest_fuzz = max(author_fuzz, book_fuzz, isbn_fuzz)

                        resultlist.append({
                            'authorname':
                            Author,
                            'bookid':
                            item['id'],
                            'bookname':
                            item['volumeInfo']['title'],
                            'booksub':
                            booksub,
                            'bookisbn':
                            bookisbn,
                            'bookpub':
                            bookpub,
                            'bookdate':
                            bookdate,
                            'booklang':
                            booklang,
                            'booklink':
                            item['volumeInfo']['canonicalVolumeLink'],
                            'bookrate':
                            float(bookrate),
                            'bookimg':
                            bookimg,
                            'bookpages':
                            bookpages,
                            'bookgenre':
                            bookgenre,
                            'bookdesc':
                            bookdesc,
                            'author_fuzz':
                            author_fuzz,
                            'book_fuzz':
                            book_fuzz,
                            'isbn_fuzz':
                            isbn_fuzz,
                            'highest_fuzz':
                            highest_fuzz,
                            'num_reviews':
                            num_reviews
                        })

                        resultcount = resultcount + 1

                    if startindex >= number_results:
                        logger.debug("Found %s total results" % total_count)
                        logger.debug("Removed %s bad language results" %
                                     ignored)
                        logger.debug("Removed %s books with no author" %
                                     no_author_count)
                        logger.info(
                            "Showing %s results for (%s) with keyword: %s" %
                            (resultcount, api_value, authorname))
                        break
                    else:
                        continue

            except KeyError:
                break

        logger.info('The Google Books API was hit %s times for keyword %s' %
                    (str(api_hits), self.name))
        queue.put(resultlist)

Exemplo n.º 8

0

Exibir arquivo

Arquivo: librarysync.py Projeto: NVRemoteDev/LazyLibrarian

def LibraryScan(dir=None):
    if not dir:
        if not lazylibrarian.DOWNLOAD_DIR:
            return
        else:
            dir = lazylibrarian.DOWNLOAD_DIR

    if not os.path.isdir(dir):
        logger.warn(
            'Cannot find directory: %s. Not scanning' %
            dir.decode(lazylibrarian.SYS_ENCODING, 'replace'))
        return

    myDB = database.DBConnection()

    myDB.action('drop table if exists stats')
    myDB.action(
        'create table stats (authorname text, GR_book_hits int, GR_lang_hits int, LT_lang_hits int, \
                            GB_lang_change, cache_hits int, bad_lang int, bad_char int, uncached int )')

    logger.info(
        'Scanning ebook directory: %s' %
        dir.decode(lazylibrarian.SYS_ENCODING, 'replace'))

    new_book_count = 0
    file_count = 0

    if lazylibrarian.FULL_SCAN:
        books = myDB.select(
            'select AuthorName, BookName, BookFile, BookID from books where Status="Open"')
        status = lazylibrarian.NOTFOUND_STATUS
        logger.info('Missing books will be marked as %s' % status)
        for book in books:
            bookName = book['BookName']
            bookAuthor = book['AuthorName']
            bookID = book['BookID']
            bookfile = book['BookFile']

            if not(bookfile and os.path.isfile(bookfile)):
                myDB.action('update books set Status="%s" where BookID="%s"' % (status, bookID))
                myDB.action('update books set BookFile="" where BookID="%s"' % bookID)
                logger.warn('Book %s - %s updated as not found on disk' % (bookAuthor, bookName))
        
    # to save repeat-scans of the same directory if it contains multiple formats of the same book, 
    # keep track of which directories we've already looked at 
    processed_subdirectories = []

    matchString = ''
    for char in lazylibrarian.EBOOK_DEST_FILE:
        matchString = matchString + '\\' + char
    # massage the EBOOK_DEST_FILE config parameter into something we can use
    # with regular expression matching
    booktypes = ''
    count = -1
    booktype_list = formatter.getList(lazylibrarian.EBOOK_TYPE)
    for book_type in booktype_list:
        count += 1
        if count == 0:
            booktypes = book_type
        else:
            booktypes = booktypes + '|' + book_type
    matchString = matchString.replace("\\$\\A\\u\\t\\h\\o\\r", "(?P<author>.*?)").replace(
        "\\$\\T\\i\\t\\l\\e", "(?P<book>.*?)") + '\.[' + booktypes + ']'
    pattern = re.compile(matchString, re.VERBOSE)

    for r, d, f in os.walk(dir):
        for directory in d[:]:
            if directory.startswith("."):
                d.remove(directory)
            # prevent magazine being scanned
            if directory.startswith("_"):
                d.remove(directory)

        for files in f:
            file_count += 1

            if isinstance(r, str):
                r = r.decode('utf-8')

            subdirectory = r.replace(dir, '')
            # Added new code to skip if we've done this directory before.
            # Made this conditional with a switch in config.ini
            # in case user keeps multiple different books in the same subdirectory
            if (lazylibrarian.IMP_SINGLEBOOK) and (subdirectory in processed_subdirectories):
                logger.debug("[%s] already scanned" % subdirectory)
            else:
                # If this is a book, try to get author/title/isbn/language
                # if epub or mobi, read metadata from the book
                # If metadata.opf exists, use that allowing it to override
                # embedded metadata. User may have edited metadata.opf
                # to merge author aliases together
                # If all else fails, try pattern match for author/title
                # and look up isbn/lang from LT or GR later
                match = 0
                if formatter.is_valid_booktype(files):

                    logger.debug("[%s] Now scanning subdirectory %s" %
                                 (dir, subdirectory))

                    language = "Unknown"
                    isbn = ""
                    book = ""
                    author = ""
                    words = files.split('.')
                    extn = words[len(words) - 1]

                    # if it's an epub or a mobi we can try to read metadata from it
                    if (extn == "epub") or (extn == "mobi"):
                        book_filename = os.path.join(
                            r.encode(lazylibrarian.SYS_ENCODING), files.encode(lazylibrarian.SYS_ENCODING))

                        try:
                            res = get_book_info(book_filename)
                        except:
                            res = {}
                        if 'title' in res and 'creator' in res:  # this is the minimum we need
                            match = 1
                            book = res['title']
                            author = res['creator']
                            if 'language' in res:
                                language = res['language']
                            if 'identifier' in res:
                                isbn = res['identifier']
                            if 'type' in res:
                                extn = res['type']

                            logger.debug("book meta [%s] [%s] [%s] [%s] [%s]" %
                                         (isbn, language, author, book, extn))
                        else:

                            logger.debug("Book meta incomplete in %s" % book_filename)

                    # calibre uses "metadata.opf", LL uses "bookname - authorname.opf"
                    # just look for any .opf file in the current directory since we don't know
                    # LL preferred authorname/bookname at this point.
                    # Allow metadata in file to override book contents as may be users pref

                    metafile = opf_file(r)
                    try:
                        res = get_book_info(metafile)
                    except:
                        res = {}
                    if 'title' in res and 'creator' in res:  # this is the minimum we need
                        match = 1
                        book = res['title']
                        author = res['creator']
                        if 'language' in res:
                            language = res['language']
                        if 'identifier' in res:
                            isbn = res['identifier']
                        logger.debug(
                            "file meta [%s] [%s] [%s] [%s]" %
                            (isbn, language, author, book))
                    else:
                        logger.debug("File meta incomplete in %s" % metafile)

                    if not match:  # no author/book from metadata file, and not embedded either
                        match = pattern.match(files)
                        if match:
                            author = match.group("author")
                            book = match.group("book")
                        else:
                            logger.debug("Pattern match failed [%s]" % files)

                    if match:
                        # flag that we found a book in this subdirectory
                        processed_subdirectories.append(subdirectory)

                        # If we have a valid looking isbn, and language != "Unknown", add it to cache
                        if language != "Unknown" and formatter.is_valid_isbn(isbn):
                            logger.debug(
                                "Found Language [%s] ISBN [%s]" %
                                (language, isbn))
                            # we need to add it to language cache if not already
                            # there, is_valid_isbn has checked length is 10 or 13
                            if len(isbn) == 10:
                                isbnhead = isbn[0:3]
                            else:
                                isbnhead = isbn[3:6]
                            match = myDB.action(
                                'SELECT lang FROM languages where isbn = "%s"' %
                                (isbnhead)).fetchone()
                            if not match:
                                myDB.action(
                                    'insert into languages values ("%s", "%s")' %
                                    (isbnhead, language))
                                logger.debug(
                                    "Cached Lang [%s] ISBN [%s]" %
                                    (language, isbnhead))
                            else:
                                logger.debug(
                                    "Already cached Lang [%s] ISBN [%s]" %
                                    (language, isbnhead))

                        # get authors name in a consistent format
                        if "," in author:  # "surname, forename"
                            words = author.split(',')
                            author = words[1].strip() + ' ' + words[0].strip()  # "forename surname"
                        if author[1] == ' ':
                            author = author.replace(' ', '.')
                            author = author.replace('..', '.')

                        # Check if the author exists, and import the author if not,
                        # before starting any complicated book-name matching to save repeating the search
                        #
                        check_exist_author = myDB.action(
                            'SELECT * FROM authors where AuthorName="%s"' %
                            author).fetchone()
                        if not check_exist_author and lazylibrarian.ADD_AUTHOR:
                            # no match for supplied author, but we're allowed to
                            # add new ones

                            GR = GoodReads(author)
                            try:
                                author_gr = GR.find_author_id()
                            except:
                                logger.warn(
                                    "Error finding author id for [%s]" %
                                    author)
                                continue

                            # only try to add if GR data matches found author data
                            if author_gr:
                                authorname = author_gr['authorname']

                                # "J.R.R. Tolkien" is the same person as "J. R. R. Tolkien" and "J R R Tolkien"
                                match_auth = author.replace('.', '_')
                                match_auth = match_auth.replace(' ', '_')
                                match_auth = match_auth.replace('__', '_')
                                match_name = authorname.replace('.', '_')
                                match_name = match_name.replace(' ', '_')
                                match_name = match_name.replace('__', '_')
                                match_name = common.remove_accents(match_name)
                                match_auth = common.remove_accents(match_auth)
                                # allow a degree of fuzziness to cater for different accented character handling.
                                # some author names have accents,
                                # filename may have the accented or un-accented version of the character
                                # The currently non-configurable value of fuzziness might need to go in config
                                # We stored GoodReads unmodified author name in
                                # author_gr, so store in LL db under that
                                # fuzz.ratio doesn't lowercase for us
                                match_fuzz = fuzz.ratio(match_auth.lower(), match_name.lower())
                                if match_fuzz < 90:
                                    logger.debug(
                                        "Failed to match author [%s] fuzz [%d]" %
                                        (author, match_fuzz))
                                    logger.debug(
                                        "Failed to match author [%s] to authorname [%s]" %
                                        (match_auth, match_name))

                                # To save loading hundreds of books by unknown
                                # authors at GR or GB, ignore if author "Unknown"
                                if (author != "Unknown") and (match_fuzz >= 90):
                                    # use "intact" name for author that we stored in
                                    # GR author_dict, not one of the various mangled versions
                                    # otherwise the books appear to be by a different author!
                                    author = author_gr['authorname']
                                    # this new authorname may already be in the
                                    # database, so check again
                                    check_exist_author = myDB.action(
                                        'SELECT * FROM authors where AuthorName="%s"' %
                                        author).fetchone()
                                    if not check_exist_author:
                                        logger.debug(
                                            "Adding new author [%s]" %
                                            author)
                                        try:
                                            importer.addAuthorToDB(author)
                                            check_exist_author = myDB.action(
                                                'SELECT * FROM authors where AuthorName="%s"' %
                                                author).fetchone()
                                        except:
                                            continue

                        # check author exists in db, either newly loaded or already there
                        if not check_exist_author:
                            logger.debug(
                                "Failed to match author [%s] in database" %
                                author)
                        else:
                            # author exists, check if this book by this author is in our database
                            # metadata might have quotes in book name
                            book = book.replace('"', '').replace("'", "")
                            bookid = find_book_in_db(myDB, author, book)

                            if bookid:
                                # check if book is already marked as "Open" (if so,
                                # we already had it)

                                check_status = myDB.action(
                                    'SELECT Status from books where BookID="%s"' %
                                    bookid).fetchone()
                                if check_status['Status'] != 'Open':
                                    # update status as we've got this book

                                    myDB.action(
                                        'UPDATE books set Status="Open" where BookID="%s"' %
                                        bookid)

                                    book_filename = os.path.join(r, files)

                                    # update book location so we can check if it
                                    # gets removed, or allow click-to-open

                                    myDB.action(
                                        'UPDATE books set BookFile="%s" where BookID="%s"' %
                                        (book_filename, bookid))

                                    new_book_count += 1

    cachesize = myDB.action("select count('ISBN') as counter from languages").fetchone()
    logger.info(
        "%s new/modified books found and added to the database" %
        new_book_count)
    logger.info("%s files processed" % file_count)
    stats = myDB.action(
        "SELECT sum(GR_book_hits), sum(GR_lang_hits), sum(LT_lang_hits), sum(GB_lang_change), \
            sum(cache_hits), sum(bad_lang), sum(bad_char), sum(uncached) FROM stats").fetchone()
    if stats['sum(GR_book_hits)'] is not None:
        # only show stats if new books added
        if lazylibrarian.BOOK_API == "GoogleBooks":
            logger.debug(
                "GoogleBooks was hit %s times for books" %
                stats['sum(GR_book_hits)'])
            logger.debug(
                "GoogleBooks language was changed %s times" %
                stats['sum(GB_lang_change)'])
        if lazylibrarian.BOOK_API == "GoodReads":
            logger.debug(
                "GoodReads was hit %s times for books" %
                stats['sum(GR_book_hits)'])
            logger.debug(
                "GoodReads was hit %s times for languages" %
                stats['sum(GR_lang_hits)'])
        logger.debug(
            "LibraryThing was hit %s times for languages" %
            stats['sum(LT_lang_hits)'])
        logger.debug(
            "Language cache was hit %s times" %
            stats['sum(cache_hits)'])
        logger.debug(
            "Unwanted language removed %s books" %
            stats['sum(bad_lang)'])
        logger.debug(
            "Unwanted characters removed %s books" %
            stats['sum(bad_char)'])
        logger.debug(
            "Unable to cache %s books with missing ISBN" %
            stats['sum(uncached)'])
    logger.debug("Cache %s hits, %s miss" % (lazylibrarian.CACHE_HIT, lazylibrarian.CACHE_MISS))
    logger.debug("ISBN Language cache holds %s entries" % cachesize['counter'])
    stats = len(myDB.select('select BookID from Books where status="Open" and BookLang="Unknown"'))
    if stats:
        logger.warn("There are %s books in your library with unknown language" % stats)

    authors = myDB.select('select AuthorName from authors')
    # Update bookcounts for all authors, not just new ones - refresh may have located
    # new books for existing authors especially if switched provider gb/gr
    logger.debug('Updating bookcounts for %i authors' % len(authors))
    for author in authors:
        name = author['AuthorName']
        havebooks = myDB.action(
            'SELECT count("BookID") as counter from books WHERE AuthorName="%s" AND (Status="Have" OR Status="Open")' %
            name).fetchone()
        myDB.action('UPDATE authors set HaveBooks="%s" where AuthorName="%s"' % (havebooks['counter'], name))
        totalbooks = myDB.action(
            'SELECT count("BookID") as counter FROM books WHERE AuthorName="%s"' % name).fetchone()        
        myDB.action('UPDATE authors set TotalBooks="%s" where AuthorName="%s"' % (totalbooks['counter'], name))
        unignoredbooks = myDB.action(
            'SELECT count("BookID") as counter FROM books WHERE AuthorName="%s" AND Status!="Ignored"' %
            name).fetchone()
        myDB.action('UPDATE authors set UnignoredBooks="%s" where AuthorName="%s"' % (unignoredbooks['counter'], name))

    covers = myDB.action("select  count('bookimg') as counter from books where bookimg like 'http%'").fetchone()
    logger.info("Caching covers for %s books" % covers['counter'])

    images = myDB.action('select bookid, bookimg, bookname from books where bookimg like "http%"')
    for item in images:
        bookid = item['bookid']
        bookimg = item['bookimg']
        bookname = item['bookname']
        newimg = bookwork.cache_cover(bookid, bookimg)
        if newimg != bookimg:
            myDB.action('update books set BookImg="%s" where BookID="%s"' % (newimg, bookid))
    logger.info('Library scan complete')

Exemplo n.º 9

0

Exibir arquivo

Arquivo: librarysync.py Projeto: MaDTaZ/LazyLibrarian

def LibraryScan(dir=None):
    if not dir:
        if not lazylibrarian.DOWNLOAD_DIR:
            return
        else:
            dir = lazylibrarian.DOWNLOAD_DIR

    if not os.path.isdir(dir):
        logger.warn("Cannot find directory: %s. Not scanning" % dir.decode(lazylibrarian.SYS_ENCODING, "replace"))
        return

    myDB = database.DBConnection()

    myDB.action("drop table if exists stats")
    myDB.action(
        "create table stats ( authorname text, GR_book_hits int, GR_lang_hits int, LT_lang_hits int, GB_lang_change, cache_hits int, bad_lang int, bad_char int, uncached int )"
    )

    new_authors = []

    logger.info("Scanning ebook directory: %s" % dir.decode(lazylibrarian.SYS_ENCODING, "replace"))

    book_list = []
    new_book_count = 0
    file_count = 0
    book_exists = False

    if lazylibrarian.FULL_SCAN:
        books = myDB.select("select AuthorName, BookName, BookFile, BookID from books where Status=?", [u"Open"])
        status = lazylibrarian.NOTFOUND_STATUS
        logger.info("Missing books will be marked as %s" % status)
        for book in books:
            bookName = book["BookName"]
            bookAuthor = book["AuthorName"]
            bookID = book["BookID"]
            bookfile = book["BookFile"]

            if os.path.isfile(bookfile):
                book_exists = True
            else:
                myDB.action("update books set Status=? where BookID=?", [status, bookID])
                myDB.action('update books set BookFile="" where BookID=?', [bookID])
                logger.info("Book %s updated as not found on disk" % bookfile)
                # for book_type in getList(lazylibrarian.EBOOK_TYPE):
                # 	bookName = book['BookName']
                # 	bookAuthor = book['AuthorName']
                # 	#Default destination path, should be allowed change per config file.
                # 	dest_path = lazylibrarian.EBOOK_DEST_FOLDER.replace('$Author', bookAuthor).replace('$Title', bookName)
                # 	#dest_path = authorname+'/'+bookname
                # 	global_name = lazylibrarian.EBOOK_DEST_FILE.replace('$Author', bookAuthor).replace('$Title', bookName)
                #
                # 	encoded_book_path = os.path.join(dir,dest_path,global_name + "." + book_type).encode(lazylibrarian.SYS_ENCODING)
                # 	if os.path.isfile(encoded_book_path):
                # 		book_exists = True
                # if not book_exists:
                # 	myDB.action('update books set Status=? where AuthorName=? and BookName=?',[status,bookAuthor,bookName])
                # 	logger.info('Book %s updated as not found on disk' % encoded_book_path.decode(lazylibrarian.SYS_ENCODING, 'replace') )
                if bookAuthor not in new_authors:
                    new_authors.append(bookAuthor)

                # guess this was meant to save repeat-scans of the same directory
                # if it contains multiple formats of the same book, but there was no code
                # that looked at the array. renamed from latest to processed to make purpose clearer
    processed_subdirectories = []

    matchString = ""
    for char in lazylibrarian.EBOOK_DEST_FILE:
        matchString = matchString + "\\" + char
    # massage the EBOOK_DEST_FILE config parameter into something we can use with regular expression matching
    booktypes = ""
    count = -1
    booktype_list = getList(lazylibrarian.EBOOK_TYPE)
    for book_type in booktype_list:
        count += 1
        if count == 0:
            booktypes = book_type
        else:
            booktypes = booktypes + "|" + book_type
    matchString = (
        matchString.replace("\\$\\A\\u\\t\\h\\o\\r", "(?P<author>.*?)").replace("\\$\\T\\i\\t\\l\\e", "(?P<book>.*?)")
        + "\.["
        + booktypes
        + "]"
    )
    pattern = re.compile(matchString, re.VERBOSE)

    for r, d, f in os.walk(dir):
        for directory in d[:]:
            if directory.startswith("."):
                d.remove(directory)
                # prevent magazine being scanned
            if directory.startswith("_"):
                d.remove(directory)
        for files in f:
            file_count += 1
            subdirectory = r.replace(dir, "")
            # Added new code to skip if we've done this directory before. Made this conditional with a switch in config.ini
            # in case user keeps multiple different books in the same subdirectory
            if (lazylibrarian.IMP_SINGLEBOOK) and (subdirectory in processed_subdirectories):
                logger.debug("[%s] already scanned" % subdirectory)
            else:
                logger.info(
                    "[%s] Now scanning subdirectory %s"
                    % (
                        dir.decode(lazylibrarian.SYS_ENCODING, "replace"),
                        subdirectory.decode(lazylibrarian.SYS_ENCODING, "replace"),
                    )
                )

                # 			If this is a book, try to get author/title/isbn/language
                # 			If metadata.opf exists, use that
                # 			else if epub or mobi, read metadata from the book
                # 			else have to try pattern match for author/title	and look up isbn/lang from LT or GR later
                #
                # 			Is it a book (extension found in booktypes)
                match = 0
                words = files.split(".")
                extn = words[len(words) - 1]
                if extn in booktypes:
                    # see if there is a metadata file in this folder with the info we need
                    try:
                        metafile = os.path.join(r, "metadata.opf").encode(lazylibrarian.SYS_ENCODING)
                        res = get_book_info(metafile)
                        if res:
                            book = res["title"]
                            author = res["creator"]
                            language = res["language"]
                            isbn = res["identifier"]
                            match = 1
                            logger.debug("file meta [%s] [%s] [%s] [%s]" % (isbn, language, author, book))

                    except:
                        logger.debug("No metadata file in %s" % r)

                    if not match:
                        # it's a book, but no external metadata found
                        # if it's an epub or a mobi we can try to read metadata from it
                        if (extn == "epub") or (extn == "mobi"):
                            book_file = os.path.join(r, files).encode(lazylibrarian.SYS_ENCODING)
                            res = get_book_info(book_file)
                            if res:
                                book = res["title"]
                                author = res["creator"]
                                language = res["language"]
                                isbn = res["identifier"]
                                match = 1
                                logger.debug("book meta [%s] [%s] [%s] [%s]" % (isbn, language, author, book))

                if not match:
                    match = pattern.match(files)
                    if match:
                        author = match.group("author")
                        book = match.group("book")
                    else:
                        logger.debug("Pattern match failed [%s]" % files)

                else:
                    processed_subdirectories.append(subdirectory)  # flag that we found a book in this subdirectory
                    #
                    # If we have a valid looking isbn, and language != "Unknown", add it to cache
                    #
                    if not language:
                        language = "Unknown"

                        # strip any formatting from the isbn
                    isbn = re.sub("[- ]", "", isbn)
                    if len(isbn) != 10 and len(isbn) != 13:
                        isbn = ""
                    if not isbn.isdigit():
                        isbn = ""
                    if isbn != "" and language != "Unknown":
                        logger.debug("Found Language [%s] ISBN [%s]" % (language, isbn))
                        # we need to add it to language cache if not already there
                        if len(isbn) == 10:
                            isbnhead = isbn[0:3]
                        else:
                            isbnhead = isbn[3:6]
                        match = myDB.action('SELECT lang FROM languages where isbn = "%s"' % (isbnhead)).fetchone()
                        if not match:
                            myDB.action('insert into languages values ("%s", "%s")' % (isbnhead, language))
                            logger.debug("Cached Lang [%s] ISBN [%s]" % (language, isbnhead))
                        else:
                            logger.debug("Already cached Lang [%s] ISBN [%s]" % (language, isbnhead))

                            # get authors name in a consistent format
                    if "," in author:  # "surname, forename"
                        words = author.split(",")
                        author = words[1].strip() + " " + words[0].strip()  # "forename surname"
                    author = author.replace(". ", " ")
                    author = author.replace(".", " ")
                    author = author.replace("  ", " ")

                    # Check if the author exists, and import the author if not,
                    # before starting any complicated book-name matching to save repeating the search
                    #
                    check_exist_author = myDB.action("SELECT * FROM authors where AuthorName=?", [author]).fetchone()
                    if not check_exist_author and lazylibrarian.ADD_AUTHOR:
                        # no match for supplied author, but we're allowed to add new ones

                        GR = GoodReads(author)
                        try:
                            author_gr = GR.find_author_id()
                        except:
                            logger.error("Error finding author id for [%s]" % author)
                            continue

                            # only try to add if GR data matches found author data
                            # not sure what this is for, never seems to fail??
                        if author_gr:
                            authorname = author_gr["authorname"]

                            # "J.R.R. Tolkien" is the same person as "J. R. R. Tolkien" and "J R R Tolkien"
                            match_auth = author.replace(".", "_")
                            match_auth = match_auth.replace(" ", "_")
                            match_auth = match_auth.replace("__", "_")
                            match_name = authorname.replace(".", "_")
                            match_name = match_name.replace(" ", "_")
                            match_name = match_name.replace("__", "_")

                            # allow a degree of fuzziness to cater for different accented character handling.
                            # some author names have accents,
                            # filename may have the accented or un-accented version of the character
                            # The (currently non-configurable) value of fuzziness works for one accented character
                            # We stored GoodReads unmodified author name in author_gr, so store in LL db under that
                            match_fuzz = fuzz.ratio(match_auth, match_name)
                            if match_fuzz < 90:
                                logger.info("Failed to match author [%s] fuzz [%d]" % (author, match_fuzz))
                                logger.info("match author [%s] authorname [%s]" % (match_auth, match_name))

                                # To save loading hundreds of books by unknown authors at GR or GB, ignore if author "Unknown"
                            if (author != "Unknown") and (match_fuzz >= 90):
                                # use "intact" name for author that we stored in
                                # GR author_dict, not one of the various mangled versions
                                # otherwise the books appear to be by a different author!
                                author = author_gr["authorname"]
                                # this new authorname may already be in the database, so check again
                                check_exist_author = myDB.action(
                                    "SELECT * FROM authors where AuthorName=?", [author]
                                ).fetchone()
                                if not check_exist_author:
                                    logger.info("Adding new author [%s]" % author)
                                    if author not in new_authors:
                                        new_authors.append(author)
                                    try:
                                        importer.addAuthorToDB(author)
                                        check_exist_author = myDB.action(
                                            "SELECT * FROM authors where AuthorName=?", [author]
                                        ).fetchone()
                                    except:
                                        continue

                                        # check author exists in db, either newly loaded or already there
                    if not check_exist_author:
                        logger.info("Failed to match author [%s] in database" % author)
                    else:
                        # author exists, check if this book by this author is in our database
                        bookid = find_book_in_db(myDB, author, book)
                        if bookid:
                            # check if book is already marked as "Open" (if so, we already had it)
                            check_status = myDB.action("SELECT Status from books where BookID=?", [bookid]).fetchone()
                            if check_status["Status"] != "Open":
                                # update status as we've got this book
                                myDB.action("UPDATE books set Status=? where BookID=?", [u"Open", bookid])
                                book_file = os.path.join(r, files).encode(lazylibrarian.SYS_ENCODING)
                                # update book location so we can check if it gets removed, or maybe allow click-to-open?
                                myDB.action("UPDATE books set BookFile=? where BookID=?", [book_file, bookid])
                                new_book_count += 1

    cachesize = myDB.action("select count(*) from languages").fetchone()
    logger.info("%s new/modified books found and added to the database" % new_book_count)
    logger.info("%s files processed" % file_count)
    stats = myDB.action(
        "SELECT sum(GR_book_hits), sum(GR_lang_hits), sum(LT_lang_hits), sum(GB_lang_change), sum(cache_hits), sum(bad_lang), sum(bad_char), sum(uncached) FROM stats"
    ).fetchone()
    if lazylibrarian.BOOK_API == "GoogleBooks":
        logger.info("GoogleBooks was hit %s times for books" % stats["sum(GR_book_hits)"])
        logger.info("GoogleBooks language was changed %s times" % stats["sum(GB_lang_change)"])
    if lazylibrarian.BOOK_API == "GoodReads":
        logger.info("GoodReads was hit %s times for books" % stats["sum(GR_book_hits)"])
        logger.info("GoodReads was hit %s times for languages" % stats["sum(GR_lang_hits)"])
    logger.info("LibraryThing was hit %s times for languages" % stats["sum(LT_lang_hits)"])
    logger.info("Language cache was hit %s times" % stats["sum(cache_hits)"])
    logger.info("Unwanted language removed %s books" % stats["sum(bad_lang)"])
    logger.info("Unwanted characters removed %s books" % stats["sum(bad_char)"])
    logger.info("Unable to cache %s books with missing ISBN" % stats["sum(uncached)"])
    logger.info("ISBN Language cache holds %s entries" % cachesize["count(*)"])
    stats = len(myDB.select("select BookID from Books where status=? and BookLang=?", ["Open", "Unknown"]))
    logger.info("There are %s books in your library with unknown language" % stats)

    logger.info("Updating %i authors" % len(new_authors))
    for auth in new_authors:
        havebooks = len(myDB.select("select BookName from Books where status=? and AuthorName=?", ["Open", auth]))
        myDB.action("UPDATE authors set HaveBooks=? where AuthorName=?", [havebooks, auth])
        totalbooks = len(myDB.select("select BookName from Books where status!=? and AuthorName=?", ["Ignored", auth]))
        myDB.action("UPDATE authors set UnignoredBooks=? where AuthorName=?", [totalbooks, auth])

    logger.info("Library scan complete")

Exemplo n.º 10

0

Exibir arquivo

def find_book_in_db(myDB, author, book):
    # PAB fuzzy search for book in library, return LL bookid if found or zero
    # if not, return bookid to more easily update status
    # prefer an exact match on author & book
    match = myDB.match('SELECT BookID FROM books where AuthorName="%s" and BookName="%s"' %
                       (author.replace('"', '""'), book.replace('"', '""')))
    if match:
        logger.debug('Exact match [%s]' % book)
        return match['BookID']
    else:
        # Try a more complex fuzzy match against each book in the db by this author
        # Using hard-coded ratios for now, ratio high (>90), partial_ratio lower (>85)
        # These are results that work well on my library, minimal false matches and no misses
        # on books that should be matched
        # Maybe make ratios configurable in config.ini later

        books = myDB.select('SELECT BookID,BookName,BookISBN FROM books where AuthorName="%s"' %
                            author.replace('"', '""'))
        best_ratio = 0
        best_partial = 0
        best_partname = 0
        ratio_name = ""
        partial_name = ""
        partname_name = ""
        ratio_id = 0
        partial_id = 0
        partname_id = 0
        partname = 0

        book_lower = unaccented(book.lower())
        book_partname, book_sub = split_title(author, book_lower)
        if book_partname == book_lower:
            book_partname = ''

        for a_book in books:
            # tidy up everything to raise fuzziness scores
            # still need to lowercase for matching against partial_name later on
            a_book_lower = unaccented(a_book['BookName'].lower())
            #
            ratio = fuzz.ratio(book_lower, a_book_lower)
            partial = fuzz.partial_ratio(book_lower, a_book_lower)
            if book_partname:
                partname = fuzz.partial_ratio(book_partname, a_book_lower)

            # lose a point for each extra word in the fuzzy matches so we get the closest match
            words = len(getList(book_lower))
            words -= len(getList(a_book_lower))
            ratio -= abs(words)
            partial -= abs(words)

            if ratio > best_ratio:
                best_ratio = ratio
                ratio_name = a_book['BookName']
                ratio_id = a_book['BookID']
            if partial > best_partial:
                best_partial = partial
                partial_name = a_book['BookName']
                partial_id = a_book['BookID']
            if partname > best_partname:
                best_partname = partname
                partname_name = a_book['BookName']
                partname_id = a_book['BookID']

            if partial == best_partial:
                # prefer the match closest to the left, ie prefer starting with a match and ignoring the rest
                # this eliminates most false matches against omnibuses when we want a single book
                # find the position of the shortest string in the longest
                if len(getList(book_lower)) >= len(getList(a_book_lower)):
                    match1 = book_lower.find(a_book_lower)
                else:
                    match1 = a_book_lower.find(book_lower)

                if len(getList(book_lower)) >= len(getList(partial_name.lower())):
                    match2 = book_lower.find(partial_name.lower())
                else:
                    match2 = partial_name.lower().find(book_lower)

                if match1 < match2:
                    logger.debug(
                        "Fuzz left change, prefer [%s] over [%s] for [%s]" %
                        (a_book['BookName'], partial_name, book))
                    best_partial = partial
                    partial_name = a_book['BookName']
                    partial_id = a_book['BookID']

        if best_ratio > 90:
            logger.debug(
                "Fuzz match   ratio [%d] [%s] [%s]" % (best_ratio, book, ratio_name))
            return ratio_id
        if best_partial > 85:
            logger.debug(
                "Fuzz match partial [%d] [%s] [%s]" % (best_partial, book, partial_name))
            return partial_id
        if best_partname > 95:
            logger.debug(
                "Fuzz match partname [%d] [%s] [%s]" % (best_partname, book, partname_name))
            return partname_id

        logger.debug(
            'Fuzz failed [%s - %s] ratio [%d,%s], partial [%d,%s], partname [%d,%s]' %
            (author, book, best_ratio, ratio_name, best_partial, partial_name, best_partname, partname_name))
        return 0

Exemplo n.º 11

0

Exibir arquivo

Arquivo: gb.py Projeto: andyuc88/LazyLibrarian

    def find_results(self, searchterm=None, queue=None):
        """ GoogleBooks performs much better if we search for author OR title
            not both at once, so if searchterm is not isbn, two searches needed.
            Lazylibrarian searches use <ll> to separate title from author in searchterm
            If this token isn't present, it's an isbn or searchterm as supplied by user
        """
        try:
            myDB = database.DBConnection()
            resultlist = []
            # See if we should check ISBN field, otherwise ignore it
            api_strings = ['inauthor:', 'intitle:']
            if is_valid_isbn(searchterm):
                api_strings = ['isbn:']

            api_hits = 0

            ignored = 0
            total_count = 0
            no_author_count = 0
            title = ''
            authorname = ''

            if ' <ll> ' in searchterm:  # special token separates title from author
                title, authorname = searchterm.split(' <ll> ')

            fullterm = searchterm.replace(' <ll> ', ' ')
            logger.debug('Now searching Google Books API with searchterm: %s' % fullterm)

            for api_value in api_strings:
                set_url = self.url
                if api_value == "isbn:":
                    set_url = set_url + quote(api_value + searchterm)
                elif api_value == 'intitle:':
                    searchterm = fullterm
                    if title:  # just search for title
                        # noinspection PyUnresolvedReferences
                        title = title.split(' (')[0]  # without any series info
                        searchterm = title
                    searchterm = searchterm.replace("'", "").replace('"', '').strip()  # and no quotes
                    if PY2:
                        searchterm = searchterm.encode(lazylibrarian.SYS_ENCODING)
                    set_url = set_url + quote(api_value + '"' + searchterm + '"')
                elif api_value == 'inauthor:':
                    searchterm = fullterm
                    if authorname:
                        searchterm = authorname  # just search for author
                    searchterm = searchterm.strip()
                    if PY2:
                        searchterm = searchterm.encode(lazylibrarian.SYS_ENCODING)
                    set_url = set_url + quote_plus(api_value + '"' + searchterm + '"')

                startindex = 0
                resultcount = 0
                ignored = 0
                number_results = 1
                total_count = 0
                no_author_count = 0
                try:
                    while startindex < number_results:

                        self.params['startIndex'] = startindex
                        URL = set_url + '&' + urlencode(self.params)

                        try:
                            jsonresults, in_cache = gb_json_request(URL)
                            if jsonresults is None:
                                number_results = 0
                            else:
                                if not in_cache:
                                    api_hits += 1
                                number_results = jsonresults['totalItems']
                                logger.debug('Searching url: ' + URL)
                            if number_results == 0:
                                logger.warn('Found no results for %s with value: %s' % (api_value, searchterm))
                                break
                            else:
                                pass
                        except Exception as err:
                            if hasattr(err, 'reason'):
                                errmsg = err.reason
                            else:
                                errmsg = str(err)
                            logger.warn(
                                'Google Books API Error [%s]: Check your API key or wait a while' % errmsg)
                            break

                        startindex += 40

                        for item in jsonresults['items']:
                            total_count += 1

                            book = bookdict(item)
                            if not book['author']:
                                logger.debug('Skipped a result without authorfield.')
                                no_author_count += 1
                                continue

                            if not book['name']:
                                logger.debug('Skipped a result without title.')
                                continue

                            valid_langs = getList(lazylibrarian.CONFIG['IMP_PREFLANG'])
                            if "All" not in valid_langs:  # don't care about languages, accept all
                                try:
                                    # skip if language is not in valid list -
                                    booklang = book['lang']
                                    if booklang not in valid_langs:
                                        logger.debug(
                                            'Skipped %s with language %s' % (book['name'], booklang))
                                        ignored += 1
                                        continue
                                except KeyError:
                                    ignored += 1
                                    logger.debug('Skipped %s where no language is found' % book['name'])
                                    continue

                            if authorname:
                                author_fuzz = fuzz.ratio(book['author'], authorname)
                            else:
                                author_fuzz = fuzz.ratio(book['author'], fullterm)

                            if title:
                                book_fuzz = fuzz.token_set_ratio(book['name'], title)
                                # lose a point for each extra word in the fuzzy matches so we get the closest match
                                words = len(getList(book['name']))
                                words -= len(getList(title))
                                book_fuzz -= abs(words)
                            else:
                                book_fuzz = fuzz.token_set_ratio(book['name'], fullterm)

                            isbn_fuzz = 0
                            if is_valid_isbn(fullterm):
                                isbn_fuzz = 100

                            highest_fuzz = max((author_fuzz + book_fuzz) / 2, isbn_fuzz)

                            dic = {':': '.', '"': '', '\'': ''}
                            bookname = replace_all(book['name'], dic)

                            bookname = unaccented(bookname)
                            bookname = bookname.strip()  # strip whitespace

                            AuthorID = ''
                            if book['author']:
                                match = myDB.match(
                                    'SELECT AuthorID FROM authors WHERE AuthorName=?', (
                                        book['author'].replace('"', '""'),))
                                if match:
                                    AuthorID = match['AuthorID']

                            resultlist.append({
                                'authorname': book['author'],
                                'authorid': AuthorID,
                                'bookid': item['id'],
                                'bookname': bookname,
                                'booksub': book['sub'],
                                'bookisbn': book['isbn'],
                                'bookpub': book['pub'],
                                'bookdate': book['date'],
                                'booklang': book['lang'],
                                'booklink': book['link'],
                                'bookrate': float(book['rate']),
                                'bookrate_count': book['rate_count'],
                                'bookimg': book['img'],
                                'bookpages': book['pages'],
                                'bookgenre': book['genre'],
                                'bookdesc': book['desc'],
                                'author_fuzz': author_fuzz,
                                'book_fuzz': book_fuzz,
                                'isbn_fuzz': isbn_fuzz,
                                'highest_fuzz': highest_fuzz,
                                'num_reviews': book['ratings']
                            })

                            resultcount += 1

                except KeyError:
                    break

                logger.debug("Returning %s result%s for (%s) with keyword: %s" %
                             (resultcount, plural(resultcount), api_value, searchterm))

            logger.debug("Found %s result%s" % (total_count, plural(total_count)))
            logger.debug("Removed %s unwanted language result%s" % (ignored, plural(ignored)))
            logger.debug("Removed %s book%s with no author" % (no_author_count, plural(no_author_count)))
            logger.debug('The Google Books API was hit %s time%s for searchterm: %s' %
                         (api_hits, plural(api_hits), fullterm))
            queue.put(resultlist)

        except Exception:
            logger.error('Unhandled exception in GB.find_results: %s' % traceback.format_exc())

Exemplo n.º 12

0

Exibir arquivo

Arquivo: gb.py Projeto: niprog/LazyLibrarian

    def find_results(self, authorname=None, queue=None):
        threading.currentThread().name = "GB-SEARCH"
        resultlist = []
        # See if we should check ISBN field, otherwise ignore it
        try:
            isbn_check = int(authorname[:-1])
            if (len(str(isbn_check)) == 9) or (len(str(isbn_check)) == 12):
                api_strings = ["isbn:"]
            else:
                api_strings = ["inauthor:", "intitle:"]
        except:
            api_strings = ["inauthor:", "intitle:"]

        api_hits = 0
        logger.info("Now searching Google Books API with keyword: " + self.name)

        for api_value in api_strings:
            startindex = 0
            if api_value == "isbn:":
                set_url = self.url + urllib.quote(api_value + self.name)
            else:
                set_url = self.url + urllib.quote(api_value + '"' + self.name + '"')

            try:
                startindex = 0
                resultcount = 0
                removedResults = 0
                ignored = 0

                total_count = 0
                no_author_count = 0
                while True:

                    self.params["startIndex"] = startindex
                    URL = set_url + "&" + urllib.urlencode(self.params)

                    try:
                        jsonresults = json.JSONDecoder().decode(urllib2.urlopen(URL, timeout=30).read())
                        api_hits = api_hits + 1
                        number_results = jsonresults["totalItems"]
                        logger.debug("Searching url: " + URL)
                        if number_results == 0:
                            logger.info("Found no results for %s with value: %s" % (api_value, self.name))
                            break
                        else:
                            pass
                    except HTTPError, err:
                        logger.warn("Google Books API Error [%s]: Check your API key or wait a while" % err.msg)
                        break

                    startindex = startindex + 40

                    for item in jsonresults["items"]:

                        total_count = total_count + 1

                        # skip if no author, no author is no book.
                        try:
                            Author = item["volumeInfo"]["authors"][0]
                        except KeyError:
                            logger.debug("Skipped a result without authorfield.")
                            no_author_count = no_author_count + 1
                            continue

                        try:
                            # skip if language is in ignore list
                            booklang = item["volumeInfo"]["language"]
                            valid_langs = [valid_lang.strip() for valid_lang in lazylibrarian.IMP_PREFLANG.split(",")]
                            if booklang not in valid_langs:
                                logger.debug("Skipped a book with language %s" % booklang)
                                ignored = ignored + 1
                                continue
                        except KeyError:
                            ignored = ignored + 1
                            logger.debug("Skipped a result where no language is found")
                            continue

                        try:
                            bookpub = item["volumeInfo"]["publisher"]
                        except KeyError:
                            bookpub = None

                        try:
                            booksub = item["volumeInfo"]["subtitle"]
                        except KeyError:
                            booksub = None

                        try:
                            bookdate = item["volumeInfo"]["publishedDate"]
                        except KeyError:
                            bookdate = "0000-00-00"
                        bookdate = bookdate[:4]

                        try:
                            bookimg = item["volumeInfo"]["imageLinks"]["thumbnail"]
                        except KeyError:
                            bookimg = "images/nocover.png"

                        try:
                            bookrate = item["volumeInfo"]["averageRating"]
                        except KeyError:
                            bookrate = 0

                        try:
                            bookpages = item["volumeInfo"]["pageCount"]
                        except KeyError:
                            bookpages = "0"

                        try:
                            bookgenre = item["volumeInfo"]["categories"][0]
                        except KeyError:
                            bookgenre = None

                        try:
                            bookdesc = item["volumeInfo"]["description"]
                        except KeyError:
                            bookdesc = "Not available"

                        try:
                            num_reviews = item["volumeInfo"]["ratingsCount"]
                        except KeyError:
                            num_reviews = 0

                        try:
                            if item["volumeInfo"]["industryIdentifiers"][0]["type"] == "ISBN_10":
                                bookisbn = item["volumeInfo"]["industryIdentifiers"][0]["identifier"]
                            else:
                                bookisbn = 0
                        except KeyError:
                            bookisbn = 0

                        author_fuzz = fuzz.ratio(Author.lower(), authorname.lower())
                        book_fuzz = fuzz.ratio(item["volumeInfo"]["title"].lower(), authorname.lower())
                        try:
                            isbn_check = int(authorname[:-1])
                            if (len(str(isbn_check)) == 9) or (len(str(isbn_check)) == 12):
                                isbn_fuzz = int(100)
                            else:
                                isbn_fuzz = int(0)
                        except:
                            isbn_fuzz = int(0)
                        highest_fuzz = max(author_fuzz, book_fuzz, isbn_fuzz)

                        resultlist.append(
                            {
                                "authorname": Author,
                                "bookid": item["id"],
                                "bookname": item["volumeInfo"]["title"],
                                "booksub": booksub,
                                "bookisbn": bookisbn,
                                "bookpub": bookpub,
                                "bookdate": bookdate,
                                "booklang": booklang,
                                "booklink": item["volumeInfo"]["canonicalVolumeLink"],
                                "bookrate": float(bookrate),
                                "bookimg": bookimg,
                                "bookpages": bookpages,
                                "bookgenre": bookgenre,
                                "bookdesc": bookdesc,
                                "author_fuzz": author_fuzz,
                                "book_fuzz": book_fuzz,
                                "isbn_fuzz": isbn_fuzz,
                                "highest_fuzz": highest_fuzz,
                                "num_reviews": num_reviews,
                            }
                        )

                        resultcount = resultcount + 1

                    if startindex >= number_results:
                        logger.debug("Found %s total results" % total_count)
                        logger.debug("Removed %s bad language results" % ignored)
                        logger.debug("Removed %s books with no author" % no_author_count)
                        logger.info(
                            "Showing %s results for (%s) with keyword: %s" % (resultcount, api_value, authorname)
                        )
                        break
                    else:
                        continue

            except KeyError:
                break

        logger.info("The Google Books API was hit %s times for keyword %s" % (str(api_hits), self.name))
        queue.put(resultlist)

Exemplo n.º 13

0

Exibir arquivo

Arquivo: gr.py Projeto: mcaron1234/LazyLibrarian

    def find_results(self, searchterm=None, queue=None):
        try:
            resultlist = []
            api_hits = 0
            searchtitle = ''
            searchauthorname = ''

            if ' <ll> ' in searchterm:  # special token separates title from author
                searchtitle, searchauthorname = searchterm.split(' <ll> ')
                searchterm = searchterm.replace(' <ll> ', ' ')

            searchterm = searchterm.encode(lazylibrarian.SYS_ENCODING)
            url = urllib.quote_plus(searchterm)
            set_url = 'https://www.goodreads.com/search.xml?q=' + url + '&' + urllib.urlencode(self.params)
            logger.debug('Now searching GoodReads API with searchterm: %s' % searchterm)
            # logger.debug('Searching for %s at: %s' % (searchterm, set_url))

            resultcount = 0
            try:
                try:
                    rootxml, in_cache = get_xml_request(set_url)
                except Exception as e:
                    logger.error("%s finding gr results: %s" % (type(e).__name__, str(e)))
                    return
                if rootxml is None:
                    logger.debug("Error requesting results")
                    return

                totalresults = check_int(rootxml.find('search/total-results').text, 0)

                resultxml = rootxml.getiterator('work')
                loopCount = 1
                while resultxml:
                    for author in resultxml:
                        try:
                            if author.find('original_publication_year').text is None:
                                bookdate = "0000"
                            else:
                                bookdate = author.find('original_publication_year').text
                        except (KeyError, AttributeError):
                            bookdate = "0000"

                        try:
                            authorNameResult = author.find('./best_book/author/name').text
                            # Goodreads sometimes puts extra whitepase in the author names!
                            authorNameResult = ' '.join(authorNameResult.split())
                        except (KeyError, AttributeError):
                            authorNameResult = ""

                        booksub = ""
                        bookpub = ""
                        booklang = "Unknown"

                        try:
                            bookimg = author.find('./best_book/image_url').text
                            if bookimg == 'https://www.goodreads.com/assets/nocover/111x148.png':
                                bookimg = 'images/nocover.png'
                        except (KeyError, AttributeError):
                            bookimg = 'images/nocover.png'

                        try:
                            bookrate = author.find('average_rating').text
                        except KeyError:
                            bookrate = 0

                        bookpages = '0'
                        bookgenre = ''
                        bookdesc = ''
                        bookisbn = ''

                        try:
                            booklink = 'https://www.goodreads.com/book/show/' + author.find('./best_book/id').text
                        except (KeyError, AttributeError):
                            booklink = ""

                        try:
                            authorid = author.find('./best_book/author/id').text
                        except (KeyError, AttributeError):
                            authorid = ""

                        try:
                            if author.find('./best_book/title').text is None:
                                bookTitle = ""
                            else:
                                bookTitle = author.find('./best_book/title').text
                        except (KeyError, AttributeError):
                            bookTitle = ""

                        if searchauthorname:
                            author_fuzz = fuzz.ratio(authorNameResult, searchauthorname)
                        else:
                            author_fuzz = fuzz.ratio(authorNameResult, searchterm)
                        if searchtitle:
                            book_fuzz = fuzz.token_set_ratio(bookTitle, searchtitle)
                            # lose a point for each extra word in the fuzzy matches so we get the closest match
                            words = len(getList(bookTitle))
                            words -= len(getList(searchtitle))
                            book_fuzz -= abs(words)
                        else:
                            book_fuzz = fuzz.token_set_ratio(bookTitle, searchterm)
                            words = len(getList(bookTitle))
                            words -= len(getList(searchterm))
                            book_fuzz -= abs(words)
                        isbn_fuzz = 0
                        if is_valid_isbn(searchterm):
                            isbn_fuzz = 100

                        highest_fuzz = max((author_fuzz + book_fuzz) / 2, isbn_fuzz)

                        try:
                            bookid = author.find('./best_book/id').text
                        except (KeyError, AttributeError):
                            bookid = ""

                        resultlist.append({
                            'authorname': authorNameResult,
                            'bookid': bookid,
                            'authorid': authorid,
                            'bookname': bookTitle,
                            'booksub': booksub,
                            'bookisbn': bookisbn,
                            'bookpub': bookpub,
                            'bookdate': bookdate,
                            'booklang': booklang,
                            'booklink': booklink,
                            'bookrate': float(bookrate),
                            'bookimg': bookimg,
                            'bookpages': bookpages,
                            'bookgenre': bookgenre,
                            'bookdesc': bookdesc,
                            'author_fuzz': author_fuzz,
                            'book_fuzz': book_fuzz,
                            'isbn_fuzz': isbn_fuzz,
                            'highest_fuzz': highest_fuzz,
                            'num_reviews': float(bookrate)
                        })

                        resultcount += 1

                    loopCount += 1

                    if 0 < lazylibrarian.CONFIG['MAX_PAGES'] < loopCount:
                        resultxml = None
                        logger.warn('Maximum results page search reached, still more results available')
                    elif totalresults and resultcount >= totalresults:
                        # fix for goodreads bug on isbn searches
                        resultxml = None
                    else:
                        URL = set_url + '&page=' + str(loopCount)
                        resultxml = None
                        try:
                            rootxml, in_cache = get_xml_request(URL)
                            if rootxml is None:
                                logger.debug('Error requesting page %s of results' % loopCount)
                            else:
                                resultxml = rootxml.getiterator('work')
                                if not in_cache:
                                    api_hits += 1
                        except Exception as e:
                            resultxml = None
                            logger.error("%s finding page %s of results: %s" % (type(e).__name__, loopCount, str(e)))

                    if resultxml:
                        if all(False for _ in resultxml):  # returns True if iterator is empty
                            resultxml = None

            except Exception as err:
                if err.code == 404:
                    logger.error('Received a 404 error when searching for author')
                if err.code == 403:
                    logger.warn('Access to api is denied: usage exceeded')
                else:
                    logger.error('An unexpected error has occurred when searching for an author: %s' % str(err))

            logger.debug('Found %s result%s with keyword: %s' % (resultcount, plural(resultcount), searchterm))
            logger.debug(
                'The GoodReads API was hit %s time%s for keyword %s' % (api_hits, plural(api_hits), searchterm))

            queue.put(resultlist)

        except Exception:
            logger.error('Unhandled exception in GR.find_results: %s' % traceback.format_exc())

Exemplo n.º 14

0

Exibir arquivo

Arquivo: gr.py Projeto: niprog/LazyLibrarian

    def find_results(self, authorname=None, queue=None):
        threading.currentThread().name = "GR-SEARCH"
        resultlist = []
        api_hits = 0
        url = urllib.quote_plus(authorname.encode("utf-8"))
        set_url = "http://www.goodreads.com/search.xml?q=" + url + "&" + urllib.urlencode(self.params)
        logger.info("Now searching GoodReads API with keyword: " + authorname)
        logger.debug("Searching for %s at: %s" % (authorname, set_url))

        try:

            try:
                # Cache our request
                request = urllib2.Request(set_url)
                opener = urllib2.build_opener(
                    SimpleCache.CacheHandler(".AuthorCache"), SimpleCache.ThrottlingProcessor(5)
                )
                resp = opener.open(request)
                api_hits = api_hits + 1
                sourcexml = ElementTree.parse(resp)
            except Exception, e:
                logger.error("Error finding results: " + str(e))

            rootxml = sourcexml.getroot()
            resultxml = rootxml.getiterator("work")
            author_dict = []
            resultcount = 0
            for author in resultxml:
                bookdate = "0001-01-01"

                if author.find("original_publication_year").text == None:
                    bookdate = "0000"
                else:
                    bookdate = author.find("original_publication_year").text

                authorNameResult = author.find("./best_book/author/name").text
                booksub = ""
                bookpub = ""
                booklang = "en"

                try:
                    bookimg = author.find("./best_book/image_url").text
                    if bookimg == "http://www.goodreads.com/assets/nocover/111x148.png":
                        bookimg = "images/nocover.png"
                except KeyError:
                    bookimg = "images/nocover.png"
                except AttributeError:
                    bookimg = "images/nocover.png"

                try:
                    bookrate = author.find("average_rating").text
                except KeyError:
                    bookrate = 0

                bookpages = "0"
                bookgenre = ""
                bookdesc = ""
                bookisbn = ""
                booklink = "http://www.goodreads.com/book/show/" + author.find("./best_book/id").text

                if author.find("./best_book/title").text == None:
                    bookTitle = ""
                else:
                    bookTitle = author.find("./best_book/title").text

                author_fuzz = fuzz.ratio(authorNameResult.lower(), authorname.lower())
                book_fuzz = fuzz.ratio(bookTitle.lower(), authorname.lower())
                try:
                    isbn_check = int(authorname[:-1])
                    if (len(str(isbn_check)) == 9) or (len(str(isbn_check)) == 12):
                        isbn_fuzz = int(100)
                    else:
                        isbn_fuzz = int(0)
                except:
                    isbn_fuzz = int(0)
                highest_fuzz = max(author_fuzz, book_fuzz, isbn_fuzz)

                resultlist.append(
                    {
                        "authorname": author.find("./best_book/author/name").text,
                        "bookid": author.find("./best_book/id").text,
                        "authorid": author.find("./best_book/author/id").text,
                        "bookname": bookTitle.encode("ascii", "ignore"),
                        "booksub": booksub,
                        "bookisbn": bookisbn,
                        "bookpub": bookpub,
                        "bookdate": bookdate,
                        "booklang": booklang,
                        "booklink": booklink,
                        "bookrate": float(bookrate),
                        "bookimg": bookimg,
                        "bookpages": bookpages,
                        "bookgenre": bookgenre,
                        "bookdesc": bookdesc,
                        "author_fuzz": author_fuzz,
                        "book_fuzz": book_fuzz,
                        "isbn_fuzz": isbn_fuzz,
                        "highest_fuzz": highest_fuzz,
                        "num_reviews": float(bookrate),
                    }
                )

                resultcount = resultcount + 1

Exemplo n.º 15

0

Exibir arquivo

	def find_results(self, authorname=None, queue=None):
		threading.currentThread().name = "GR-SEARCH"
		resultlist = []
		api_hits = 0
		url = urllib.quote_plus(authorname.encode('utf-8'))
		set_url = 'http://www.goodreads.com/search.xml?q=' + url + '&' + urllib.urlencode(self.params)
		logger.info('Now searching GoodReads API with keyword: ' + authorname)
		logger.debug('Searching for %s at: %s' % (authorname, set_url))

		try:

			try:
				# Cache our request
				request = urllib2.Request(set_url)
				if lazylibrarian.PROXY_HOST:
					request.set_proxy(lazylibrarian.PROXY_HOST, lazylibrarian.PROXY_TYPE)
				request.add_header('User-Agent', USER_AGENT)
				opener = urllib2.build_opener(SimpleCache.CacheHandler(".AuthorCache"), SimpleCache.ThrottlingProcessor(5))
				resp = opener.open(request)
				api_hits = api_hits + 1
				sourcexml = ElementTree.parse(resp)
			except Exception, e:
				logger.error("Error finding results: " + str(e))

			rootxml = sourcexml.getroot()
			resultxml = rootxml.getiterator('work')
			author_dict = []
			resultcount = 0
			for author in resultxml:
				bookdate = "0001-01-01"

				if (author.find('original_publication_year').text == None):
					bookdate = "0000"
				else:
					bookdate = author.find('original_publication_year').text

				authorNameResult = author.find('./best_book/author/name').text
				booksub = ""
				bookpub = ""
				booklang = "en"

				try:
					bookimg = author.find('./best_book/image_url').text
					if (bookimg == 'http://www.goodreads.com/assets/nocover/111x148.png'):
						bookimg = 'images/nocover.png'
				except KeyError:
					bookimg = 'images/nocover.png'
				except AttributeError:
					bookimg = 'images/nocover.png'

				try:
					bookrate = author.find('average_rating').text
				except KeyError:
					bookrate = 0

				bookpages = '0'
				bookgenre = ''
				bookdesc = ''
				bookisbn = ''
				booklink = 'http://www.goodreads.com/book/show/'+author.find('./best_book/id').text

				if (author.find('./best_book/title').text == None):
					bookTitle = ""
				else:
					bookTitle = author.find('./best_book/title').text

				author_fuzz = fuzz.ratio(authorNameResult.lower(), authorname.lower())
				book_fuzz = fuzz.ratio(bookTitle.lower(), authorname.lower())
				try:
					isbn_check = int(authorname[:-1])
					if (len(str(isbn_check)) == 9) or (len(str(isbn_check)) == 12):
						isbn_fuzz = int(100)
					else:
						isbn_fuzz = int(0)
				except:
					isbn_fuzz = int(0)
				highest_fuzz = max(author_fuzz, book_fuzz, isbn_fuzz)

				resultlist.append({
					'authorname': author.find('./best_book/author/name').text,
					'bookid': author.find('./best_book/id').text,
					'authorid' : author.find('./best_book/author/id').text,
					'bookname': bookTitle.encode("ascii", "ignore"),
					'booksub': booksub,
					'bookisbn': bookisbn,
					'bookpub': bookpub,
					'bookdate': bookdate,
					'booklang': booklang,
					'booklink': booklink,
					'bookrate': float(bookrate),
					'bookimg': bookimg,
					'bookpages': bookpages,
					'bookgenre': bookgenre,
					'bookdesc': bookdesc,
					'author_fuzz': author_fuzz,
					'book_fuzz': book_fuzz,
					'isbn_fuzz': isbn_fuzz,
					'highest_fuzz': highest_fuzz,
					'num_reviews': float(bookrate)
				})

				resultcount = resultcount+1

Exemplo n.º 16

0

Exibir arquivo

Arquivo: searchmag.py Projeto: duckville/LazyLibrarian

def search_magazines(mags=None):
    # produce a list of magazines to search for, tor, nzb, torznab

    myDB = database.DBConnection()
    searchlist = []
    threading.currentThread().name = "SEARCHMAGS"

    if mags is None:  # backlog search
        searchmags = myDB.select('SELECT Title, Frequency, LastAcquired, \
                                 IssueDate from magazines WHERE Status="Active"')
    else:
        searchmags = []
        for magazine in mags:
            searchmags_temp = myDB.select('SELECT Title, Frequency, LastAcquired, IssueDate from magazines \
                                          WHERE Title="%s" AND Status="Active"' % (magazine['bookid']))
            for terms in searchmags_temp:
                searchmags.append(terms)

    if len(searchmags) == 1:
        logger.info('Searching for one magazine')
    else:
        logger.info('Searching for %i magazines' % len(searchmags))

    for searchmag in searchmags:
        bookid = searchmag[0]
        searchterm = searchmag[0]
        frequency = searchmag[1]
        # last_acquired = searchmag[2]
        # issue_date = searchmag[3]

        dic = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': ''}

        searchterm = formatter.latinToAscii(formatter.replace_all(searchterm, dic))
        searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8')
        searchlist.append({"bookid": bookid, "searchterm": searchterm})

    if searchlist == []:
        logger.warn('There is nothing to search for.  Mark some magazines as active.')

    for book in searchlist:

        resultlist = []
        tor_resultlist = []
        if lazylibrarian.USE_NZB:
            resultlist, nproviders = providers.IterateOverNewzNabSites(book, 'mag')
            if not nproviders:
                logger.warn('No nzb providers are set. Check config for NEWZNAB or TORZNAB providers')

        if lazylibrarian.USE_TOR:
            tor_resultlist, nproviders = providers.IterateOverTorrentSites(book, 'mag')
            if not nproviders:
                logger.warn('No torrent providers are set. Check config for TORRENT providers')

            for item in tor_resultlist:  # reformat the torrent results so they look like nzbs
                resultlist.append({
                    'bookid': item['bookid'],
                    'nzbprov': item['tor_prov'],
                    'nzbtitle': item['tor_title'],
                    'nzburl': item['tor_url'],
                    'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100',  # fake date as none returned from torrents
                    'nzbsize': item['tor_size'],
                    'nzbmode': 'torrent'
                })

        if not resultlist:
            logger.debug("Adding magazine %s to queue." % book['searchterm'])

        else:
            bad_regex = 0
            bad_date = 0
            old_date = 0
            total_nzbs = 0
            new_date = 0
            to_snatch = 0
            maglist = []
            issues = []
            for nzb in resultlist:
                total_nzbs = total_nzbs + 1
                bookid = nzb['bookid']
                nzbtitle = (u'%s' % nzb['nzbtitle'])
                nzbtitle = nzbtitle.replace('"', '').replace("'", "")  # suppress " in titles
                nzburl = nzb['nzburl']
                nzbprov = nzb['nzbprov']
                nzbdate_temp = nzb['nzbdate']
                nzbsize_temp = nzb['nzbsize']
                if nzbsize_temp is None:  # not all torrents returned by torznab have a size
                    nzbsize_temp = 1000
                nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + ' MB'
                nzbdate = formatter.nzbdate2format(nzbdate_temp)
                nzbmode = nzb['nzbmode']
                checkifmag = myDB.select('SELECT * from magazines WHERE Title="%s"' % bookid)
                if checkifmag:
                    for results in checkifmag:
                        control_date = results['IssueDate']
                        frequency = results['Frequency']
                        # regex = results['Regex']

                    nzbtitle_formatted = nzbtitle.replace('.', ' ').replace('-', ' ').replace('/', ' ').replace(
                                            '+', ' ').replace('_', ' ').replace('(', '').replace(')', '').strip()
                    # Need to make sure that substrings of magazine titles don't get found
                    # (e.g. Maxim USA will find Maximum PC USA)
                    # keyword_check = nzbtitle_formatted.replace(bookid, '')
                    # remove extra spaces if they're in a row
                    nzbtitle_exploded_temp = " ".join(nzbtitle_formatted.split())
                    nzbtitle_exploded = nzbtitle_exploded_temp.split(' ')

                    if ' ' in bookid:
                        bookid_exploded = bookid.split(' ')
                    else:
                        bookid_exploded = [bookid]

                    # check nzb starts with magazine title, and ends with a date
                    # eg The MagPI Issue 22 - July 2015
                    # do something like check left n words match title
                    # then check last n words are a date

                    name_match = 1  # assume name matches for now
                    name_len = len(bookid_exploded)
                    if len(nzbtitle_exploded) > name_len:  # needs to be longer as it should include a date
                        while name_len:
                            name_len = name_len - 1
                            # fuzzy check on each word in the magazine name with any accents stripped
                            # fuzz.ratio doesn't lowercase for us
                            ratio = fuzz.ratio(common.remove_accents(nzbtitle_exploded[name_len].lower()),
                                               common.remove_accents(bookid_exploded[name_len].lower()))
                            if ratio < 80:  # hard coded fuzz ratio for now, works for close matches
                                logger.debug("Magazine fuzz ratio failed [%d] [%s] [%s]" % (
                                             ratio, bookid, nzbtitle_formatted))
                                name_match = 0  # name match failed
                    if name_match:
                        # some magazine torrent uploaders add their sig in [] or {}
                        # Fortunately for us, they always seem to add it at the end
                        # some magazine torrent titles are "magazine_name some_form_of_date pdf"
                        # so strip all the trailing junk...
                        while nzbtitle_exploded[len(nzbtitle_exploded) - 1][0] in '[{' or \
                            nzbtitle_exploded[len(nzbtitle_exploded) - 1].lower() == 'pdf':
                                nzbtitle_exploded.pop()  # gotta love the function names
                        
                        if len(nzbtitle_exploded) > 1:
                            # regexA = DD MonthName YYYY OR MonthName YYYY or nn MonthName YYYY
                            regexA_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1]
                            if regexA_year.isdigit():
                                if int(regexA_year) < 1900 or int(regexA_year) > 2100:
                                    regexA_year = 'Invalid'
                            regexA_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 2]
                            regexA_month = formatter.month2num(common.remove_accents(regexA_month_temp))

                            if frequency == "Weekly" or frequency == "BiWeekly":
                                regexA_day = nzbtitle_exploded[len(nzbtitle_exploded) - 3].zfill(2)
                                if regexA_day.isdigit():
                                    if int(regexA_day) > 31:  # probably issue number nn
                                        regexA_day = '01'
                                else:
                                    regexA_day = '01'  # just MonthName YYYY
                            else:
                                regexA_day = '01'  # monthly, or less frequent
                            
                            newdatish_regexA = regexA_year + regexA_month + regexA_day

                            try:
                                int(newdatish_regexA)
                                newdatish = regexA_year + '-' + regexA_month + '-' + regexA_day
                            except:
                                # regexB = MonthName DD YYYY
                                regexB_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1]
                                regexB_day = nzbtitle_exploded[len(nzbtitle_exploded) - 2].zfill(2)
                                regexB_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 3]
                                regexB_month = formatter.month2num(common.remove_accents(regexB_month_temp))
                                newdatish_regexB = regexB_year + regexB_month + regexB_day

                                try:
                                    int(newdatish_regexB)
                                    newdatish = regexB_year + '-' + regexB_month + '-' + regexB_day
                                except:
                                    # regexC = YYYY MM or YYYY MM DD or Issue nn YYYY
                                    # (can't get MM/DD if named Issue nn)
                                    newdatish_regexC = 'Invalid'  # invalid unless works out otherwise
                                    regexC_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 2]
                                    if regexC_temp.isdigit():
                                        if int(regexC_temp) > 1900 and int(regexC_temp) < 2100:  # YYYY MM  or YYYY nn
                                            regexC_year = regexC_temp
                                            regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2)
                                            regexC_day = '01'
                                            if regexC_month.isdigit():  # could be YYYY nn where nn is issue number
                                                if int(regexC_month) < 13:
                                                    # if issue number > 12 date matching will fail
                                                    newdatish_regexC = regexC_year + regexC_month + regexC_day
                                        else:
                                            regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 3]
                                            if regexC_year.isdigit():
                                                if int(regexC_year) > 1900 and int(regexC_year) < 2100:  # YYYY MM DD or YYYY nn-nn
                                                    regexC_month = regexC_temp.zfill(2)
                                                    if int(regexC_month) < 13:
                                                        # if issue number > 12 date matching will fail
                                                        regexC_day = nzbtitle_exploded[len(
                                                                nzbtitle_exploded) - 1].zfill(2)
                                                        newdatish_regexC = regexC_year + regexC_month + regexC_day

                                    try:
                                        int(newdatish_regexC)
                                        newdatish = regexC_year + '-' + regexC_month + '-' + regexC_day
                                    except:
                                        logger.debug('Magazine %s not in proper date format.' % nzbtitle_formatted)
                                        bad_date = bad_date + 1
                                        # allow issues with good name but bad date to be included
                                        # so user can manually select them
                                        newdatish = "1970-01-01"  # provide a fake date for bad-date issues
                                        # continue

                        else:
                            continue

                        # Don't want to overwrite status = Skipped for NZBs that have been previously found
                        wanted_status = myDB.select('SELECT * from wanted WHERE NZBtitle="%s"' % nzbtitle)
                        if wanted_status:
                            for results in wanted_status:
                                status = results['Status']
                        else:
                            status = "Skipped"

                        controlValueDict = {"NZBurl": nzburl}
                        newValueDict = {
                            "NZBprov": nzbprov,
                            "BookID": bookid,
                            "NZBdate": nzbdate,
                            "NZBtitle": nzbtitle,
                            "AuxInfo": newdatish,
                            "Status": status,
                            "NZBsize": nzbsize,
                            "NZBmode": nzbmode
                        }
                        myDB.upsert("wanted", newValueDict, controlValueDict)

                        if control_date is None:  # we haven't got any copies of this magazine yet
                            # get a rough time just over a month ago to compare to, in format yyyy-mm-dd
                            # could perhaps calc differently for weekly, biweekly etc
                            start_time = time.time()
                            start_time -= 31 * 24 * 60 * 60  # number of seconds in 31 days
                            control_date = time.strftime("%Y-%m-%d", time.localtime(start_time))

                        # only grab a copy if it's newer than the most recent we have,
                        # or newer than a month ago if we have none
                        comp_date = formatter.datecompare(newdatish, control_date)
                        if comp_date > 0:
                            # Should probably only upsert when downloaded and processed in case snatch fails
                            # keep track of what we're going to download so we don't download dupes
                            new_date = new_date + 1
                            issue = bookid + ',' + newdatish
                            if issue not in issues:
                                maglist.append({
                                    'bookid': bookid,
                                    'nzbprov': nzbprov,
                                    'nzbtitle': nzbtitle,
                                    'nzburl': nzburl,
                                    'nzbmode': nzbmode
                                })
                                logger.debug('This issue of %s is new, downloading' % nzbtitle_formatted)
                                to_snatch = to_snatch + 1
                                issues.append(issue)
                            else:
                                logger.debug('This issue of %s is already flagged for download' % issue)
                        else:
                            if newdatish != "1970-01-01":  # this is our fake date for ones we can't decipher
                                logger.debug('This issue of %s is old; skipping.' % nzbtitle_formatted)
                                old_date = old_date + 1
                    else:
                        logger.debug('Magazine [%s] does not completely match search term [%s].' % (
                                     nzbtitle_formatted, bookid))
                        bad_regex = bad_regex + 1

            logger.info('Found %s results for %s.  %s are new, %s are old, %s fail date, %s fail name matching' % (
                        total_nzbs, bookid, new_date, old_date, bad_date, bad_regex))
            logger.info("%s, %s issues to download" % (bookid, to_snatch))

            for items in maglist:
                if items['nzbmode'] == "torznab":
                    snatch = TORDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl'])
                elif items['nzbmode'] == "torrent":
                    snatch = TORDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl'])
                else:
                    snatch = NZBDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl'])
                if snatch:
                    notifiers.notify_snatch(formatter.latinToAscii(items['nzbtitle']) + ' at ' + formatter.now())
                    postprocess.schedule_processor(action='Start')
            maglist = []
    logger.info("Search for magazines complete")

Exemplo n.º 17

0

Exibir arquivo

Arquivo: librarysync.py Projeto: NVRemoteDev/LazyLibrarian

def find_book_in_db(myDB, author, book):
    # PAB fuzzy search for book in library, return LL bookid if found or zero
    # if not, return bookid to more easily update status
    # prefer an exact match on author & book
    match = myDB.action(
        'SELECT BookID FROM books where AuthorName="%s" and BookName="%s"' %
        (author, book)).fetchone()
    if match:
        logger.debug('Exact match [%s]' % book)
        return match['BookID']
    else:
        # No exact match
        # Try a more complex fuzzy match against each book in the db by this author
        # Using hard-coded ratios for now, ratio high (>90), partial_ratio lower (>65)
        # These are results that work well on my library, minimal false matches and no misses
        # on books that should be matched
        # Maybe make ratios configurable in config.ini later

        books = myDB.select('SELECT BookID,BookName FROM books where AuthorName="%s"' % author)
        best_ratio = 0
        best_partial = 0
        ratio_name = ""
        partial_name = ""
        ratio_id = 0
        partial_id = 0
        #logger.debug("Found %s books for %s" % (len(books), author))
        for a_book in books:
            # tidy up everything to raise fuzziness scores
            # still need to lowercase for matching against partial_name later on
            book_lower = common.remove_accents(book.lower())
            a_book_lower = common.remove_accents(a_book['BookName'].lower())
            #
            ratio = fuzz.ratio(book_lower, a_book_lower)
            partial = fuzz.partial_ratio(book_lower, a_book_lower)
            if ratio > best_ratio:
                best_ratio = ratio
                ratio_name = a_book['BookName']
                ratio_id = a_book['BookID']
            if partial > best_partial:
                best_partial = partial
                partial_name = a_book['BookName']
                partial_id = a_book['BookID']

            else:
                if partial == best_partial:
                    # prefer the match closest to the left, ie prefer starting with a match and ignoring the rest
                    # this eliminates most false matches against omnibuses
                    if a_book_lower.find(book_lower) < partial_name.lower().find(book_lower):
                        logger.debug(
                            "Fuzz left prefer [%s] over [%s]" %
                            (a_book['BookName'], partial_name))
                        best_partial = partial
                        partial_name = a_book['BookName']
                        partial_id = a_book['BookID']
            #
        if best_ratio > 90:
            logger.debug(
                "Fuzz match   ratio [%d] [%s] [%s]" %
                (best_ratio, book, ratio_name))
            return ratio_id
        if best_partial > 65:
            logger.debug(
                "Fuzz match partial [%d] [%s] [%s]" %
                (best_partial, book, partial_name))
            return partial_id

        logger.debug(
            'Fuzz failed [%s - %s] ratio [%d,%s], partial [%d,%s]' %
            (author, book, best_ratio, ratio_name, best_partial, partial_name))
        return 0

Exemplo n.º 18

0

Exibir arquivo

Arquivo: librarysync.py Projeto: forge33/LazyLibrarian

def LibraryScan(startdir=None):
    """ Scan a directory tree adding new books into database
        Return how many books you added """
    if not startdir:
        if not lazylibrarian.DESTINATION_DIR:
            return 0
        else:
            startdir = lazylibrarian.DESTINATION_DIR

    if not os.path.isdir(startdir):
        logger.warn(
            'Cannot find directory: %s. Not scanning' % startdir)
        return 0

    myDB = database.DBConnection()

    # keep statistics of full library scans
    if startdir == lazylibrarian.DESTINATION_DIR:
        myDB.action('DELETE from stats')

    logger.info('Scanning ebook directory: %s' % startdir)

    new_book_count = 0
    file_count = 0
    author = ""

    if lazylibrarian.FULL_SCAN and startdir == lazylibrarian.DESTINATION_DIR:
        books = myDB.select(
            'select AuthorName, BookName, BookFile, BookID from books where Status="Open"')
        status = lazylibrarian.NOTFOUND_STATUS
        logger.info('Missing books will be marked as %s' % status)
        for book in books:
            bookName = book['BookName']
            bookAuthor = book['AuthorName']
            bookID = book['BookID']
            bookfile = book['BookFile']

            if not(bookfile and os.path.isfile(bookfile)):
                myDB.action('update books set Status="%s" where BookID="%s"' % (status, bookID))
                myDB.action('update books set BookFile="" where BookID="%s"' % bookID)
                logger.warn('Book %s - %s updated as not found on disk' % (bookAuthor, bookName))

    # to save repeat-scans of the same directory if it contains multiple formats of the same book,
    # keep track of which directories we've already looked at
    processed_subdirectories = []

    matchString = ''
    for char in lazylibrarian.EBOOK_DEST_FILE:
        matchString = matchString + '\\' + char
    # massage the EBOOK_DEST_FILE config parameter into something we can use
    # with regular expression matching
    booktypes = ''
    count = -1
    booktype_list = getList(lazylibrarian.EBOOK_TYPE)
    for book_type in booktype_list:
        count += 1
        if count == 0:
            booktypes = book_type
        else:
            booktypes = booktypes + '|' + book_type
    matchString = matchString.replace("\\$\\A\\u\\t\\h\\o\\r", "(?P<author>.*?)").replace(
        "\\$\\T\\i\\t\\l\\e", "(?P<book>.*?)") + '\.[' + booktypes + ']'
    pattern = re.compile(matchString, re.VERBOSE)

    for r, d, f in os.walk(startdir):
        for directory in d[:]:
            # prevent magazine being scanned
            if directory.startswith("_") or directory.startswith("."):
                d.remove(directory)

        for files in f:
            file_count += 1

            if isinstance(r, str):
                r = r.decode(lazylibrarian.SYS_ENCODING)

            subdirectory = r.replace(startdir, '')
            # Added new code to skip if we've done this directory before.
            # Made this conditional with a switch in config.ini
            # in case user keeps multiple different books in the same subdirectory
            if (lazylibrarian.IMP_SINGLEBOOK) and (subdirectory in processed_subdirectories):
                logger.debug("[%s] already scanned" % subdirectory)
            else:
                # If this is a book, try to get author/title/isbn/language
                # if epub or mobi, read metadata from the book
                # If metadata.opf exists, use that allowing it to override
                # embedded metadata. User may have edited metadata.opf
                # to merge author aliases together
                # If all else fails, try pattern match for author/title
                # and look up isbn/lang from LT or GR later
                match = 0
                if is_valid_booktype(files):

                    logger.debug("[%s] Now scanning subdirectory %s" %
                                 (startdir, subdirectory))

                    language = "Unknown"
                    isbn = ""
                    book = ""
                    author = ""
                    extn = os.path.splitext(files)[1]

                    # if it's an epub or a mobi we can try to read metadata from it
                    if (extn == ".epub") or (extn == ".mobi"):
                        book_filename = os.path.join(
                            r.encode(lazylibrarian.SYS_ENCODING), files.encode(lazylibrarian.SYS_ENCODING))

                        try:
                            res = get_book_info(book_filename)
                        except:
                            res = {}
                        if 'title' in res and 'creator' in res:  # this is the minimum we need
                            match = 1
                            book = res['title']
                            author = res['creator']
                            if 'language' in res:
                                language = res['language']
                            if 'identifier' in res:
                                isbn = res['identifier']
                            if 'type' in res:
                                extn = res['type']

                            logger.debug("book meta [%s] [%s] [%s] [%s] [%s]" %
                                         (isbn, language, author, book, extn))
                        else:

                            logger.debug("Book meta incomplete in %s" % book_filename)

                    # calibre uses "metadata.opf", LL uses "bookname - authorname.opf"
                    # just look for any .opf file in the current directory since we don't know
                    # LL preferred authorname/bookname at this point.
                    # Allow metadata in file to override book contents as may be users pref

                    metafile = opf_file(r)
                    try:
                        res = get_book_info(metafile)
                    except:
                        res = {}
                    if 'title' in res and 'creator' in res:  # this is the minimum we need
                        match = 1
                        book = res['title']
                        author = res['creator']
                        if 'language' in res:
                            language = res['language']
                        if 'identifier' in res:
                            isbn = res['identifier']
                        logger.debug(
                            "file meta [%s] [%s] [%s] [%s]" %
                            (isbn, language, author, book))
                    else:
                        logger.debug("File meta incomplete in %s" % metafile)

                    if not match:  # no author/book from metadata file, and not embedded either
                        match = pattern.match(files)
                        if match:
                            author = match.group("author")
                            book = match.group("book")
                        else:
                            logger.debug("Pattern match failed [%s]" % files)

                    if match:
                        # flag that we found a book in this subdirectory
                        processed_subdirectories.append(subdirectory)

                        # If we have a valid looking isbn, and language != "Unknown", add it to cache
                        if language != "Unknown" and is_valid_isbn(isbn):
                            logger.debug(
                                "Found Language [%s] ISBN [%s]" %
                                (language, isbn))
                            # we need to add it to language cache if not already
                            # there, is_valid_isbn has checked length is 10 or 13
                            if len(isbn) == 10:
                                isbnhead = isbn[0:3]
                            else:
                                isbnhead = isbn[3:6]
                            match = myDB.action(
                                'SELECT lang FROM languages where isbn = "%s"' %
                                (isbnhead)).fetchone()
                            if not match:
                                myDB.action(
                                    'insert into languages values ("%s", "%s")' %
                                    (isbnhead, language))
                                logger.debug(
                                    "Cached Lang [%s] ISBN [%s]" %
                                    (language, isbnhead))
                            else:
                                logger.debug(
                                    "Already cached Lang [%s] ISBN [%s]" %
                                    (language, isbnhead))

                        # get authors name in a consistent format
                        if "," in author:  # "surname, forename"
                            words = author.split(',')
                            author = words[1].strip() + ' ' + words[0].strip()  # "forename surname"
                        if author[1] == ' ':
                            author = author.replace(' ', '.')
                            author = author.replace('..', '.')

                        # Check if the author exists, and import the author if not,
                        # before starting any complicated book-name matching to save repeating the search
                        #
                        check_exist_author = myDB.action(
                            'SELECT * FROM authors where AuthorName="%s"' %
                            author).fetchone()
                        if not check_exist_author and lazylibrarian.ADD_AUTHOR:
                            # no match for supplied author, but we're allowed to
                            # add new ones

                            GR = GoodReads(author)
                            try:
                                author_gr = GR.find_author_id()
                            except:
                                logger.warn(
                                    "Error finding author id for [%s]" %
                                    author)
                                continue

                            # only try to add if GR data matches found author data
                            if author_gr:
                                authorname = author_gr['authorname']

                                # "J.R.R. Tolkien" is the same person as "J. R. R. Tolkien" and "J R R Tolkien"
                                match_auth = author.replace('.', '_')
                                match_auth = match_auth.replace(' ', '_')
                                match_auth = match_auth.replace('__', '_')
                                match_name = authorname.replace('.', '_')
                                match_name = match_name.replace(' ', '_')
                                match_name = match_name.replace('__', '_')
                                match_name = unaccented(match_name)
                                match_auth = unaccented(match_auth)
                                # allow a degree of fuzziness to cater for different accented character handling.
                                # some author names have accents,
                                # filename may have the accented or un-accented version of the character
                                # The currently non-configurable value of fuzziness might need to go in config
                                # We stored GoodReads unmodified author name in
                                # author_gr, so store in LL db under that
                                # fuzz.ratio doesn't lowercase for us
                                match_fuzz = fuzz.ratio(match_auth.lower(), match_name.lower())
                                if match_fuzz < 90:
                                    logger.debug(
                                        "Failed to match author [%s] fuzz [%d]" %
                                        (author, match_fuzz))
                                    logger.debug(
                                        "Failed to match author [%s] to authorname [%s]" %
                                        (match_auth, match_name))

                                # To save loading hundreds of books by unknown
                                # authors at GR or GB, ignore if author "Unknown"
                                if (author != "Unknown") and (match_fuzz >= 90):
                                    # use "intact" name for author that we stored in
                                    # GR author_dict, not one of the various mangled versions
                                    # otherwise the books appear to be by a different author!
                                    author = author_gr['authorname']
                                    # this new authorname may already be in the
                                    # database, so check again
                                    check_exist_author = myDB.action(
                                        'SELECT * FROM authors where AuthorName="%s"' %
                                        author).fetchone()
                                    if not check_exist_author:
                                        logger.info(
                                            "Adding new author [%s]" %
                                            author)
                                        try:
                                            addAuthorToDB(author)
                                            check_exist_author = myDB.action(
                                                'SELECT * FROM authors where AuthorName="%s"' %
                                                author).fetchone()
                                        except:
                                            continue

                        # check author exists in db, either newly loaded or already there
                        if not check_exist_author:
                            logger.debug(
                                "Failed to match author [%s] in database" %
                                author)
                        else:
                            # author exists, check if this book by this author is in our database
                            # metadata might have quotes in book name
                            book = book.replace('"', '').replace("'", "")
                            bookid = find_book_in_db(myDB, author, book)

                            if bookid:
                                # check if book is already marked as "Open" (if so,
                                # we already had it)

                                check_status = myDB.action(
                                    'SELECT Status from books where BookID="%s"' %
                                    bookid).fetchone()
                                if check_status['Status'] != 'Open':
                                    # update status as we've got this book

                                    myDB.action(
                                        'UPDATE books set Status="Open" where BookID="%s"' %
                                        bookid)

                                    book_filename = os.path.join(r, files)

                                    # update book location so we can check if it
                                    # gets removed, or allow click-to-open

                                    myDB.action(
                                        'UPDATE books set BookFile="%s" where BookID="%s"' %
                                        (book_filename, bookid))

                                    # update cover file to cover.jpg in book folder (if exists)
                                    bookdir = book_filename.rsplit(os.sep, 1)[0]
                                    coverimg = os.path.join(bookdir, 'cover.jpg')
                                    cachedir = os.path.join(str(lazylibrarian.PROG_DIR), 'data' + os.sep + 'images' + os.sep + 'cache')
                                    cacheimg = os.path.join(cachedir, bookid + '.jpg')
                                    if os.path.isfile(coverimg):
                                        copyfile(coverimg, cacheimg)

                                    new_book_count += 1
                            else:
                                logger.debug(
                                    "Failed to match book [%s] by [%s] in database" %
                                    (book, author))


    logger.info("%s new/modified book%s found and added to the database" %
                (new_book_count, plural(new_book_count)))
    logger.info("%s file%s processed" % (file_count, plural(file_count)))

    # show statistics of full library scans
    if startdir == lazylibrarian.DESTINATION_DIR:
        stats = myDB.action(
            "SELECT sum(GR_book_hits), sum(GR_lang_hits), sum(LT_lang_hits), sum(GB_lang_change), \
                sum(cache_hits), sum(bad_lang), sum(bad_char), sum(uncached), sum(duplicates) FROM stats").fetchone()
        if stats['sum(GR_book_hits)'] is not None:
            # only show stats if new books added
            if lazylibrarian.BOOK_API == "GoogleBooks":
                logger.debug("GoogleBooks was hit %s time%s for books" %
                    (stats['sum(GR_book_hits)'], plural(stats['sum(GR_book_hits)'])))
                logger.debug("GoogleBooks language was changed %s time%s" %
                    (stats['sum(GB_lang_change)'], plural(stats['sum(GB_lang_change)'])))
            if lazylibrarian.BOOK_API == "GoodReads":
                logger.debug("GoodReads was hit %s time%s for books" %
                    (stats['sum(GR_book_hits)'], plural(stats['sum(GR_book_hits)'])))
                logger.debug("GoodReads was hit %s time%s for languages" %
                    (stats['sum(GR_lang_hits)'], plural(stats['sum(GR_lang_hits)'])))
            logger.debug("LibraryThing was hit %s time%s for languages" %
                (stats['sum(LT_lang_hits)'], plural (stats['sum(LT_lang_hits)'])))
            logger.debug("Language cache was hit %s time%s" %
                (stats['sum(cache_hits)'], plural(stats['sum(cache_hits)'])))
            logger.debug("Unwanted language removed %s book%s" %
                (stats['sum(bad_lang)'], plural (stats['sum(bad_lang)'])))
            logger.debug("Unwanted characters removed %s book%s" %
                (stats['sum(bad_char)'], plural(stats['sum(bad_char)'])))
            logger.debug("Unable to cache %s book%s with missing ISBN" %
                (stats['sum(uncached)'], plural(stats['sum(uncached)'])))
            logger.debug("Found %s duplicate book%s" %
                (stats['sum(duplicates)'], plural(stats['sum(duplicates)'])))
            logger.debug("Cache %s hit%s, %s miss" %
                (lazylibrarian.CACHE_HIT, plural(lazylibrarian.CACHE_HIT), lazylibrarian.CACHE_MISS))
            cachesize = myDB.action("select count('ISBN') as counter from languages").fetchone()
            logger.debug("ISBN Language cache holds %s entries" % cachesize['counter'])
            nolang = len(myDB.select('select BookID from Books where status="Open" and BookLang="Unknown"'))
            if nolang:
                logger.warn("Found %s book%s in your library with unknown language" % (nolang, plural(nolang)))

        authors = myDB.select('select AuthorID from authors')
        # Update bookcounts for all authors, not just new ones - refresh may have located
        # new books for existing authors especially if switched provider gb/gr
    else:
        # single author/book import
        authors = myDB.select('select AuthorID from authors where AuthorName = "%s"' % author)

    logger.debug('Updating bookcounts for %i author%s' % (len(authors), plural(len(authors))))
    for author in authors:
        update_totals(author['AuthorID'])

    images = myDB.select('select bookid, bookimg, bookname from books where bookimg like "http%"')
    if len(images):
        logger.info("Caching cover%s for %i book%s" % (plural(len(images)), len(images), plural(len(images))))
        for item in images:
            bookid = item['bookid']
            bookimg = item['bookimg']
            bookname = item['bookname']
            newimg = cache_cover(bookid, bookimg)
            if newimg is not None:
                myDB.action('update books set BookImg="%s" where BookID="%s"' % (newimg, bookid))

    images = myDB.select('select AuthorID, AuthorImg, AuthorName from authors where AuthorImg like "http%"')
    if len(images):
        logger.info("Caching image%s for %i author%s" % (plural(len(images)), len(images), plural(len(images))))
        for item in images:
            authorid = item['authorid']
            authorimg = item['authorimg']
            authorname = item['authorname']
            newimg = cache_cover(authorid, authorimg)
            if newimg is not None:
                myDB.action('update authors set AuthorImg="%s" where AuthorID="%s"' % (newimg, authorid))
    setWorkPages()
    logger.info('Library scan complete')
    return new_book_count

Exemplo n.º 19

0

Exibir arquivo

def addAuthorNameToDB(author=None, refresh=False, addbooks=True):
    # get authors name in a consistent format, look them up in the database
    # if not in database, try to import them.
    # return authorname,new where new=False if author already in db, new=True if added
    # authorname returned is our preferred name, or empty string if not found or unable to add
    myDB = database.DBConnection()
    new = False
    if len(author) < 2:
        logger.debug('Invalid Author Name [%s]' % author)
        return "", "", False

    author = formatAuthorName(author)
    # Check if the author exists, and import the author if not,
    check_exist_author = myDB.match(
        'SELECT AuthorID FROM authors where AuthorName="%s"' %
        author.replace('"', '""'))

    if not check_exist_author and lazylibrarian.CONFIG['ADD_AUTHOR']:
        logger.debug('Author %s not found in database, trying to add' % author)
        # no match for supplied author, but we're allowed to add new ones
        GR = GoodReads(author)
        try:
            author_gr = GR.find_author_id()
        except Exception as e:
            logger.warn("Error finding author id for [%s] %s" %
                        (author, str(e)))
            return "", "", False

        # only try to add if GR data matches found author data
        if author_gr:
            authorname = author_gr['authorname']
            #authorid = author_gr['authorid']
            # "J.R.R. Tolkien" is the same person as "J. R. R. Tolkien" and "J R R Tolkien"
            match_auth = author.replace('.', ' ')
            match_auth = ' '.join(match_auth.split())

            match_name = authorname.replace('.', ' ')
            match_name = ' '.join(match_name.split())

            match_name = unaccented(match_name)
            match_auth = unaccented(match_auth)

            # allow a degree of fuzziness to cater for different accented character handling.
            # some author names have accents,
            # filename may have the accented or un-accented version of the character
            # The currently non-configurable value of fuzziness might need to go in config
            # We stored GoodReads unmodified author name in
            # author_gr, so store in LL db under that
            # fuzz.ratio doesn't lowercase for us
            match_fuzz = fuzz.ratio(match_auth.lower(), match_name.lower())
            if match_fuzz < 90:
                logger.debug(
                    "Failed to match author [%s] to authorname [%s] fuzz [%d]"
                    % (author, match_name, match_fuzz))

            # To save loading hundreds of books by unknown authors at GR or GB, ignore unknown
            if (author != "Unknown") and (match_fuzz >= 90):
                # use "intact" name for author that we stored in
                # GR author_dict, not one of the various mangled versions
                # otherwise the books appear to be by a different author!
                author = author_gr['authorname']
                authorid = author_gr['authorid']
                # this new authorname may already be in the
                # database, so check again
                check_exist_author = myDB.match(
                    'SELECT AuthorID FROM authors where AuthorID="%s"' %
                    authorid)
                if check_exist_author:
                    logger.debug('Found goodreads authorname %s in database' %
                                 author)
                else:
                    logger.info("Adding new author [%s]" % author)
                    try:
                        addAuthorToDB(authorname=author,
                                      refresh=refresh,
                                      authorid=authorid,
                                      addbooks=addbooks)
                        check_exist_author = myDB.match(
                            'SELECT AuthorID FROM authors where AuthorID="%s"'
                            % authorid)
                        if check_exist_author:
                            new = True
                    except Exception:
                        logger.debug('Failed to add author [%s] to db' %
                                     author)
    # check author exists in db, either newly loaded or already there
    if not check_exist_author:
        logger.debug("Failed to match author [%s] in database" % author)
        return "", "", False
    return author, check_exist_author['AuthorID'], new

Exemplo n.º 20

0

Exibir arquivo

    def find_results(self, searchterm=None, queue=None):
        try:
            resultlist = []
            api_hits = 0
            # we don't use the title/author separator in goodreads
            searchterm = searchterm.replace(' <ll> ', '')

            url = urllib.quote_plus(searchterm.encode(lazylibrarian.SYS_ENCODING))
            set_url = 'http://www.goodreads.com/search.xml?q=' + url + '&' + urllib.urlencode(self.params)
            logger.debug('Now searching GoodReads API with searchterm: %s' % searchterm)
            #logger.debug('Searching for %s at: %s' % (searchterm, set_url))

            resultcount = 0
            try:
                try:
                    rootxml, in_cache = get_xml_request(set_url)
                except Exception as e:
                    logger.error("Error finding gr results: %s" % str(e))
                    return
                if not len(rootxml):
                    logger.debug("Error requesting results")
                    return

                resultxml = rootxml.getiterator('work')
                for author in resultxml:

                    if author.find('original_publication_year').text is None:
                        bookdate = "0000"
                    else:
                        bookdate = author.find('original_publication_year').text

                    authorNameResult = author.find('./best_book/author/name').text
                    # Goodreads sometimes puts extra whitepase in the author names!
                    authorNameResult =  ' '.join(authorNameResult.split())
                    booksub = ""
                    bookpub = ""
                    booklang = "Unknown"

                    try:
                        bookimg = author.find('./best_book/image_url').text
                        if bookimg == 'http://www.goodreads.com/assets/nocover/111x148.png':
                            bookimg = 'images/nocover.png'
                    except (KeyError, AttributeError):
                        bookimg = 'images/nocover.png'

                    try:
                        bookrate = author.find('average_rating').text
                    except KeyError:
                        bookrate = 0

                    bookpages = '0'
                    bookgenre = ''
                    bookdesc = ''
                    bookisbn = ''
                    booklink = 'http://www.goodreads.com/book/show/' + author.find('./best_book/id').text

                    if author.find('./best_book/title').text is None:
                        bookTitle = ""
                    else:
                        bookTitle = author.find('./best_book/title').text

                    author_fuzz = fuzz.ratio(authorNameResult, searchterm)
                    book_fuzz = fuzz.ratio(bookTitle, searchterm)
                    isbn_fuzz = 0
                    if is_valid_isbn(searchterm):
                            isbn_fuzz = 100

                    highest_fuzz = max((author_fuzz + book_fuzz) / 2, isbn_fuzz)

                    bookid = author.find('./best_book/id').text

                    resultlist.append({
                        'authorname': author.find('./best_book/author/name').text,
                        'bookid': bookid,
                        'authorid': author.find('./best_book/author/id').text,
                        'bookname': bookTitle.encode("ascii", "ignore"),
                        'booksub': booksub,
                        'bookisbn': bookisbn,
                        'bookpub': bookpub,
                        'bookdate': bookdate,
                        'booklang': booklang,
                        'booklink': booklink,
                        'bookrate': float(bookrate),
                        'bookimg': bookimg,
                        'bookpages': bookpages,
                        'bookgenre': bookgenre,
                        'bookdesc': bookdesc,
                        'author_fuzz': author_fuzz,
                        'book_fuzz': book_fuzz,
                        'isbn_fuzz': isbn_fuzz,
                        'highest_fuzz': highest_fuzz,
                        'num_reviews': float(bookrate)
                    })

                    resultcount += 1

            except urllib2.HTTPError as err:
                if err.code == 404:
                    logger.error('Received a 404 error when searching for author')
                if err.code == 403:
                    logger.warn('Access to api is denied: usage exceeded')
                else:
                    logger.error('An unexpected error has occurred when searching for an author: %s' % str(err))

            logger.debug('Found %s result%s with keyword: %s' % (resultcount, plural(resultcount), searchterm))
            logger.debug(
                'The GoodReads API was hit %s time%s for keyword %s' % (api_hits, plural(api_hits), searchterm))

            queue.put(resultlist)

        except Exception:
            logger.error('Unhandled exception in GR.find_results: %s' % traceback.format_exc())

Exemplo n.º 21

0

Exibir arquivo

Arquivo: librarysync.py Projeto: Trixter69/LazyLibrarian

def LibraryScan(dir=None):
    if not dir:
        if not lazylibrarian.DOWNLOAD_DIR:
            return
        else:
            dir = lazylibrarian.DOWNLOAD_DIR

    if not os.path.isdir(dir):
        logger.warn(
            'Cannot find directory: %s. Not scanning' %
            dir.decode(lazylibrarian.SYS_ENCODING, 'replace'))
        return

    myDB = database.DBConnection()

    myDB.action('drop table if exists stats')
    myDB.action(
        'create table stats ( authorname text, GR_book_hits int, GR_lang_hits int, LT_lang_hits int, GB_lang_change, cache_hits int, bad_lang int, bad_char int, uncached int )')

    new_authors = []

    logger.info(
        'Scanning ebook directory: %s' %
        dir.decode(lazylibrarian.SYS_ENCODING, 'replace'))

    new_book_count = 0
    file_count = 0

    if lazylibrarian.FULL_SCAN:
        books = myDB.select(
            'select AuthorName, BookName, BookFile, BookID from books where Status="Open"')
        status = lazylibrarian.NOTFOUND_STATUS
        logger.info('Missing books will be marked as %s' % status)
        for book in books:
            bookName = book['BookName']
            bookAuthor = book['AuthorName']
            bookID = book['BookID']
            bookfile = book['BookFile']

            if not(bookfile and os.path.isfile(bookfile)):
                myDB.action(
                    'update books set Status="%s" where BookID="%s"' %
                    (status, bookID))
                myDB.action(
                    'update books set BookFile="" where BookID="%s"' %
                    bookID)
                logger.warn(
                    'Book %s - %s updated as not found on disk' %
                    (bookAuthor, bookName))
            # for book_type in getList(lazylibrarian.EBOOK_TYPE):
            #	bookName = book['BookName']
            #	bookAuthor = book['AuthorName']
            # Default destination path, should be allowed change per config file.
            #	dest_path = lazylibrarian.EBOOK_DEST_FOLDER.replace('$Author', bookAuthor).replace('$Title', bookName)
            # dest_path = authorname+'/'+bookname
            #	global_name = lazylibrarian.EBOOK_DEST_FILE.replace('$Author', bookAuthor).replace('$Title', bookName)
#
            #	encoded_book_path = os.path.join(dir,dest_path,global_name + "." + book_type).encode(lazylibrarian.SYS_ENCODING)
            #	if os.path.isfile(encoded_book_path):
            #		book_exists = True
            # if not book_exists:
            #	myDB.action('update books set Status=? where AuthorName=? and BookName=?',[status,bookAuthor,bookName])
            # logger.info('Book %s updated as not found on disk' %
            # encoded_book_path.decode(lazylibrarian.SYS_ENCODING, 'replace') )
                if bookAuthor not in new_authors:
                    new_authors.append(bookAuthor)

    # guess this was meant to save repeat-scans of the same directory
    # if it contains multiple formats of the same book, but there was no code
    # that looked at the array. renamed from latest to processed to make
    # purpose clearer
    processed_subdirectories = []

    matchString = ''
    for char in lazylibrarian.EBOOK_DEST_FILE:
        matchString = matchString + '\\' + char
    # massage the EBOOK_DEST_FILE config parameter into something we can use
    # with regular expression matching
    booktypes = ''
    count = -1
    booktype_list = formatter.getList(lazylibrarian.EBOOK_TYPE)
    for book_type in booktype_list:
        count += 1
        if count == 0:
            booktypes = book_type
        else:
            booktypes = booktypes + '|' + book_type
    matchString = matchString.replace("\\$\\A\\u\\t\\h\\o\\r", "(?P<author>.*?)").replace(
        "\\$\\T\\i\\t\\l\\e", "(?P<book>.*?)") + '\.[' + booktypes + ']'
    pattern = re.compile(matchString, re.VERBOSE)

    for r, d, f in os.walk(dir):
        for directory in d[:]:
            if directory.startswith("."):
                d.remove(directory)
            # prevent magazine being scanned
            if directory.startswith("_"):
                d.remove(directory)
        for files in f:
            file_count += 1
            subdirectory = r.replace(dir, '')
            # Added new code to skip if we've done this directory before. Made this conditional with a switch in config.ini
            # in case user keeps multiple different books in the same
            # subdirectory
            if (lazylibrarian.IMP_SINGLEBOOK) and (subdirectory in processed_subdirectories):
                logger.debug("[%s] already scanned" % subdirectory)
            else:
# 			If this is a book, try to get author/title/isbn/language
# 			If metadata.opf exists, use that
# 			else if epub or mobi, read metadata from the book
# 			else have to try pattern match for author/title	and look up isbn/lang from LT or GR late
                match = 0
                extn = ""
                
                if '.' in files:
                    words = files.split('.')
                    extn = words[len(words) - 1]
                    
                if formatter.is_valid_booktype(files):
                    logger.debug(
                        "[%s] Now scanning subdirectory %s" %
                        (dir.decode(lazylibrarian.SYS_ENCODING, 'replace'), subdirectory.decode(lazylibrarian.SYS_ENCODING, 'replace')))
                    
                    # calibre uses "metadata.opf", LL uses "bookname - authorname.opf"
                    # just look for any .opf file in the current directory since we don't know 
                    # LL preferred authorname/bookname at this point
                    metafile = opf_file(r)
                    try:
                        res = get_book_info(metafile)
                    except:
                        res = {}
                    if 'title' in res and 'creator' in res:  # this is the minimum we need
                        book = res['title']
                        author = res['creator']
                        if 'language' in res:
                            language = res['language']
                        else:
                            language = ""
                        if 'identifier' in res:
                            isbn = res['identifier']
                        else:
                            isbn = ""
                        match = 1
                        logger.debug(
                            "file meta [%s] [%s] [%s] [%s]" %
                            (isbn, language, author, book))
                    else:
                        logger.debug("File meta incomplete in %s" % metafile)

                    if not match:
                        # it's a book, but no external metadata found
                        # if it's an epub or a mobi we can try to read metadata
                        # from it
                        if (extn == "epub") or (extn == "mobi"):
                            book_filename = os.path.join(r, files).encode(lazylibrarian.SYS_ENCODING)
                            try:
                                res = get_book_info(book_filename)
                            except:
                                res = {}
                            if 'title' in res and 'creator' in res:  # this is the minimum we need
                                book = res['title']
                                author = res['creator']
                                if 'language' in res:
                                    language = res['language']
                                else:
                                    language = ""
                                if 'identifier' in res:
                                    isbn = res['identifier']
                                else:
                                    isbn = ""
                                logger.debug("book meta [%s] [%s] [%s] [%s]" %
                                    (isbn, language, author, book))
                                match = 1
                            else:
                                logger.debug("Book meta incomplete in %s" % book_filename)

                if not match:
                    match = pattern.match(files)
                    if match:
                        author = match.group("author")
                        book = match.group("book")
                    else:
                        logger.debug("Pattern match failed [%s]" % files)

                if match:
                    processed_subdirectories.append(
                        subdirectory)  # flag that we found a book in this subdirectory
                    #
                    # If we have a valid looking isbn, and language != "Unknown", add it to cache
                    #
                    if not language:
                        language = "Unknown"

                    if not formatter.is_valid_isbn(isbn):
                        isbn = ""
                    if isbn != "" and language != "Unknown":
                        logger.debug(
                            "Found Language [%s] ISBN [%s]" %
                            (language, isbn))
                        # we need to add it to language cache if not already
                        # there, is_valid_isbn has checked length is 10 or 13
                        if len(isbn) == 10:
                            isbnhead = isbn[0:3]
                        else:
                            isbnhead = isbn[3:6]
                        match = myDB.action(
                            'SELECT lang FROM languages where isbn = "%s"' %
                            (isbnhead)).fetchone()
                        if not match:
                            myDB.action(
                                'insert into languages values ("%s", "%s")' %
                                (isbnhead, language))
                            logger.debug(
                                "Cached Lang [%s] ISBN [%s]" %
                                (language, isbnhead))
                        else:
                            logger.debug(
                                "Already cached Lang [%s] ISBN [%s]" %
                                (language, isbnhead))

                    # get authors name in a consistent format
                    if "," in author:  # "surname, forename"
                        words = author.split(',')
                        author = words[1].strip() + ' ' + words[0].strip()  # "forename surname"
                    if author[1] == ' ':        
                        author = author.replace(' ', '.')
                        author = author.replace('..', '.')

                    # Check if the author exists, and import the author if not,
                    # before starting any complicated book-name matching to save repeating the search
                    #
                    check_exist_author = myDB.action(
                        'SELECT * FROM authors where AuthorName="%s"' %
                        author).fetchone()
                    if not check_exist_author and lazylibrarian.ADD_AUTHOR:
                        # no match for supplied author, but we're allowed to
                        # add new ones

                        GR = GoodReads(author)
                        try:
                            author_gr = GR.find_author_id()
                        except:
                            logger.warn(
                                "Error finding author id for [%s]" %
                                author)
                            continue

                        # only try to add if GR data matches found author data
                        # not sure what this is for, never seems to fail??
                        if author_gr:
                            authorname = author_gr['authorname']

                            # "J.R.R. Tolkien" is the same person as "J. R. R. Tolkien" and "J R R Tolkien"
                            match_auth = author.replace('.', '_')
                            match_auth = match_auth.replace(' ', '_')
                            match_auth = match_auth.replace('__', '_')
                            match_name = authorname.replace('.', '_')
                            match_name = match_name.replace(' ', '_')
                            match_name = match_name.replace('__', '_')
                            match_name = common.remove_accents(match_name)
                            match_auth = common.remove_accents(match_auth)
                            # allow a degree of fuzziness to cater for different accented character handling.
                            # some author names have accents,
                            # filename may have the accented or un-accented version of the character
                            # The currently non-configurable value of fuzziness might need to go in config
                            # We stored GoodReads unmodified author name in
                            # author_gr, so store in LL db under that
                            match_fuzz = fuzz.ratio(match_auth, match_name)
                            if match_fuzz < 90:
                                logger.debug(
                                    "Failed to match author [%s] fuzz [%d]" %
                                    (author, match_fuzz))
                                logger.debug(
                                    "Failed to match author [%s] to authorname [%s]" %
                                    (match_auth, match_name))

                            # To save loading hundreds of books by unknown
                            # authors at GR or GB, ignore if author "Unknown"
                            if (author != "Unknown") and (match_fuzz >= 90):
                                # use "intact" name for author that we stored in
                                # GR author_dict, not one of the various mangled versions
                                # otherwise the books appear to be by a
                                # different author!
                                author = author_gr['authorname']
                                # this new authorname may already be in the
                                # database, so check again
                                check_exist_author = myDB.action(
                                    'SELECT * FROM authors where AuthorName="%s"' %
                                    author).fetchone()
                                if not check_exist_author:
                                    logger.debug(
                                        "Adding new author [%s]" %
                                        author)
                                    if author not in new_authors:
                                        new_authors.append(author)
                                    try:
                                        importer.addAuthorToDB(author)
                                        check_exist_author = myDB.action(
                                            'SELECT * FROM authors where AuthorName="%s"' %
                                            author).fetchone()
                                    except:
                                        continue

                    # check author exists in db, either newly loaded or already
                    # there
                    if not check_exist_author:
                        logger.debug(
                            "Failed to match author [%s] in database" %
                            author)
                    else:
                        # author exists, check if this book by this author is in our database
                        # metadata might have quotes in book name
                        book = book.replace('"', '').replace("'", "")
                        bookid = find_book_in_db(myDB, author, book)
                        if bookid:
                            # check if book is already marked as "Open" (if so,
                            # we already had it)
                            check_status = myDB.action(
                                'SELECT Status from books where BookID="%s"' %
                                bookid).fetchone()
                            if check_status['Status'] != 'Open':
                                # update status as we've got this book
                                myDB.action(
                                    'UPDATE books set Status="Open" where BookID="%s"' %
                                    bookid)
                                book_filename = os.path.join(
                                    r,
                                    files).encode(
                                        lazylibrarian.SYS_ENCODING)
                                # update book location so we can check if it
                                # gets removed, or allow click-to-open
                                myDB.action(
                                    'UPDATE books set BookFile="%s" where BookID="%s"' %
                                    (book_filename, bookid))
                                new_book_count += 1

    cachesize = myDB.action("select count(*) from languages").fetchone()
    logger.info(
        "%s new/modified books found and added to the database" %
        new_book_count)
    logger.info("%s files processed" % file_count)
    if new_book_count:
        stats = myDB.action(
            "SELECT sum(GR_book_hits), sum(GR_lang_hits), sum(LT_lang_hits), sum(GB_lang_change), sum(cache_hits), sum(bad_lang), sum(bad_char), sum(uncached) FROM stats").fetchone()
        if lazylibrarian.BOOK_API == "GoogleBooks":
            logger.debug(
                "GoogleBooks was hit %s times for books" %
                stats['sum(GR_book_hits)'])
            logger.debug(
                "GoogleBooks language was changed %s times" %
                stats['sum(GB_lang_change)'])
        if lazylibrarian.BOOK_API == "GoodReads":
            logger.debug(
                "GoodReads was hit %s times for books" %
                stats['sum(GR_book_hits)'])
            logger.debug(
                "GoodReads was hit %s times for languages" %
                stats['sum(GR_lang_hits)'])
        logger.debug(
            "LibraryThing was hit %s times for languages" %
            stats['sum(LT_lang_hits)'])
        logger.debug(
            "Language cache was hit %s times" %
            stats['sum(cache_hits)'])
        logger.debug(
            "Unwanted language removed %s books" %
            stats['sum(bad_lang)'])
        logger.debug(
            "Unwanted characters removed %s books" %
            stats['sum(bad_char)'])
        logger.debug(
            "Unable to cache %s books with missing ISBN" %
            stats['sum(uncached)'])
    logger.debug("ISBN Language cache holds %s entries" % cachesize['count(*)'])
    stats = len(
        myDB.select('select BookID from Books where status="Open" and BookLang="Unknown"'))
    if stats:
        logger.warn(
            "There are %s books in your library with unknown language" %
            stats)

    logger.debug('Updating %i authors' % len(new_authors))
    for auth in new_authors:
        havebooks = len(
            myDB.select('select BookName from Books where status="%s" and AuthorName="%s"' %
                        ('Open', auth)))
        myDB.action(
            'UPDATE authors set HaveBooks="%s" where AuthorName="%s"' %
            (havebooks, auth))
        totalbooks = len(
            myDB.select('select BookName from Books where status!="%s" and AuthorName="%s"' %
                        ('Ignored', auth)))
        myDB.action(
            'UPDATE authors set UnignoredBooks="%s" where AuthorName="%s"' %
            (totalbooks, auth))

    logger.info('Library scan complete')

Exemplo n.º 22

0

Exibir arquivo

Arquivo: importer.py Projeto: DobyTang/LazyLibrarian

def addAuthorNameToDB(author=None, refresh=False, addbooks=True):
    # get authors name in a consistent format, look them up in the database
    # if not in database, try to import them.
    # return authorname,authorid,new where new=False if author already in db, new=True if added
    # authorname returned is our preferred name, or empty string if not found or unable to add

    new = False
    if not author or len(author) < 2:
        logger.debug('Invalid Author Name [%s]' % author)
        return "", "", False

    author = formatAuthorName(author)
    myDB = database.DBConnection()

    # Check if the author exists, and import the author if not,
    check_exist_author = myDB.match('SELECT AuthorID FROM authors where AuthorName=?', (author,))

    # If no exact match, look for a close fuzzy match to handle misspellings, accents
    if not check_exist_author:
        match_name = author.lower()
        res = myDB.action('select AuthorID,AuthorName from authors')
        for item in res:
            aname = item['AuthorName']
            if aname:
                match_fuzz = fuzz.ratio(aname.lower(), match_name)
                if match_fuzz >= 95:
                    logger.debug("Fuzzy match [%s] %s%% for [%s]" % (item['AuthorName'], match_fuzz, author))
                    check_exist_author = item
                    author = item['AuthorName']
                    break

    if not check_exist_author and lazylibrarian.CONFIG['ADD_AUTHOR']:
        logger.debug('Author %s not found in database, trying to add' % author)
        # no match for supplied author, but we're allowed to add new ones
        GR = GoodReads(author)
        try:
            author_gr = GR.find_author_id()
        except Exception as e:
            logger.warn("%s finding author id for [%s] %s" % (type(e).__name__, author, str(e)))
            return "", "", False

        # only try to add if GR data matches found author data
        if author_gr:
            authorname = author_gr['authorname']
            # authorid = author_gr['authorid']
            # "J.R.R. Tolkien" is the same person as "J. R. R. Tolkien" and "J R R Tolkien"
            match_auth = author.replace('.', ' ')
            match_auth = ' '.join(match_auth.split())

            match_name = authorname.replace('.', ' ')
            match_name = ' '.join(match_name.split())

            match_name = unaccented(match_name)
            match_auth = unaccented(match_auth)

            # allow a degree of fuzziness to cater for different accented character handling.
            # some author names have accents,
            # filename may have the accented or un-accented version of the character
            # The currently non-configurable value of fuzziness might need to go in config
            # We stored GoodReads unmodified author name in
            # author_gr, so store in LL db under that
            # fuzz.ratio doesn't lowercase for us
            match_fuzz = fuzz.ratio(match_auth.lower(), match_name.lower())
            if match_fuzz < 90:
                logger.debug("Failed to match author [%s] to authorname [%s] fuzz [%d]" %
                             (author, match_name, match_fuzz))

            # To save loading hundreds of books by unknown authors at GR or GB, ignore unknown
            if (author != "Unknown") and (match_fuzz >= 90):
                # use "intact" name for author that we stored in
                # GR author_dict, not one of the various mangled versions
                # otherwise the books appear to be by a different author!
                author = author_gr['authorname']
                authorid = author_gr['authorid']
                # this new authorname may already be in the
                # database, so check again
                check_exist_author = myDB.match('SELECT AuthorID FROM authors where AuthorID=?', (authorid,))
                if check_exist_author:
                    logger.debug('Found goodreads authorname %s in database' % author)
                else:
                    logger.info("Adding new author [%s]" % author)
                    try:
                        addAuthorToDB(authorname=author, refresh=refresh, authorid=authorid, addbooks=addbooks)
                        check_exist_author = myDB.match('SELECT AuthorID FROM authors where AuthorID=?', (authorid,))
                        if check_exist_author:
                            new = True
                    except Exception as e:
                        logger.error('Failed to add author [%s] to db: %s %s' % (author, type(e).__name__, str(e)))
    # check author exists in db, either newly loaded or already there
    if not check_exist_author:
        logger.debug("Failed to match author [%s] in database" % author)
        return "", "", False
    author = makeUnicode(author)
    return author, check_exist_author['AuthorID'], new