Python remove_accents 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: lazylibrarian.common

메소드/함수: remove_accents

hotexamples.com에서의 예제들: 16

Python remove_accents - 16개의 예제를 찾았습니다. 아래는 오픈 소스 프로젝트에서 추출한 lazylibrarian.common.remove_accents의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제의 품질 향상에 도움이 됩니다.

예제 #1

파일 보기

파일: __init__.py 프로젝트: andrzejc/LazyLibrarian

def _append_month_names(lang):
    """Append the active locale's month names to MONTHNAMES, labelled *lang*.

    Two columns are added: the full month names, then the abbreviated names
    with dots stripped from both ends.  Every name is lower-cased after being
    passed through common.remove_accents.  Row 0 of MONTHNAMES receives the
    label once per column so it stays aligned with rows 1-12.
    """
    MONTHNAMES[0].append(lang)
    for month in range(1, 13):
        MONTHNAMES[month].append(common.remove_accents(calendar.month_name[month]).lower())
    MONTHNAMES[0].append(lang)
    for month in range(1, 13):
        MONTHNAMES[month].append(common.remove_accents(calendar.month_abbr[month]).lower().strip("."))


def _log_locale_alternatives(lang):
    """Log the installed locales that share *lang*'s language prefix.

    Best effort only: relies on the external ``locale -a`` command, and any
    failure is reported as a warning rather than raised.
    """
    try:
        wanted_lang = lang.split("_")[0]
        # universal_newlines=True gives text output on Python 3 as well, so
        # the startswith() comparison below is str-vs-str on both versions.
        all_locales = subprocess.check_output(["locale", "-a"], universal_newlines=True).split()
        locale_list = [a_locale for a_locale in all_locales if a_locale.startswith(wanted_lang)]
        if locale_list:
            logger.warn("Found these alternatives: " + str(locale_list))
        else:
            logger.warn("Unable to find an alternative")
    except Exception:
        logger.warn("Unable to get a list of alternatives")


def build_monthtable():
    """Extend MONTHNAMES with month names for the configured extra locales.

    Does nothing when IMP_MONTHLANG lists no languages, or when the current
    locale cannot be read.  Otherwise the current locale's names are added
    (unless it is an "en_" locale, which is preloaded), followed by the names
    for every language in IMP_MONTHLANG.  The process locale is switched
    temporarily for each language and restored to its entry state afterwards.
    """
    wanted_langs = formatter.getList(IMP_MONTHLANG)
    if not wanted_langs:  # any extra languages wanted?
        return
    try:
        # read current state.
        # getdefaultlocale() doesnt seem to work as expected on windows, returns 'None'
        current_locale = locale.setlocale(locale.LC_ALL, "")
    except locale.Error as e:
        logger.debug("Error getting current locale : %s" % str(e))
        return

    lang = str(current_locale)
    if not lang.startswith("en_"):  # en_ is preloaded
        _append_month_names(lang)
        # logged once, after both columns are complete (was inside the loop)
        logger.info(
            "Added month names for locale [%s], %s, %s ..."
            % (lang, MONTHNAMES[1][-2], MONTHNAMES[1][-1])
        )

    for lang in wanted_langs:
        try:
            if len(lang) > 1:
                locale.setlocale(locale.LC_ALL, lang)
                _append_month_names(lang)
                locale.setlocale(locale.LC_ALL, current_locale)  # restore entry state
                logger.info(
                    "Added month names for locale [%s], %s, %s ..."
                    % (lang, MONTHNAMES[1][-2], MONTHNAMES[1][-1])
                )
        except Exception:
            locale.setlocale(locale.LC_ALL, current_locale)  # restore entry state
            logger.warn("Unable to load requested locale [%s]" % lang)
            _log_locale_alternatives(lang)
            logger.info("Set locale back to entry state %s" % current_locale)

예제 #2

파일 보기

파일: __init__.py 프로젝트: Trixter69/LazyLibrarian

def _append_month_names(lang):
    """Append the active locale's month names to MONTHNAMES, labelled *lang*.

    Two columns are added: the full month names, then the abbreviated names
    with dots stripped from both ends.  Every name is lower-cased after being
    passed through common.remove_accents.  Row 0 of MONTHNAMES receives the
    label once per column so it stays aligned with rows 1-12.
    """
    MONTHNAMES[0].append(lang)
    for month in range(1, 13):
        MONTHNAMES[month].append(common.remove_accents(calendar.month_name[month]).lower())
    MONTHNAMES[0].append(lang)
    for month in range(1, 13):
        MONTHNAMES[month].append(common.remove_accents(calendar.month_abbr[month]).lower().strip("."))


def build_monthtable():
    """Extend MONTHNAMES with month names for the current and configured locales.

    The current locale's names are always added first, then the names for each
    comma-separated language in IMP_MONTHLANG.  The process locale is switched
    temporarily for each language and restored to its entry state afterwards.
    A warning is logged if no "en_" locale ended up in the table.

    NOTE(review): the initial setlocale() call is not guarded, so a
    locale.Error from an unsupported environment locale propagates to the
    caller - confirm that is intended.
    """
    # read current state.
    # getdefaultlocale() doesnt seem to work as expected on windows, returns 'None'
    current_locale = locale.setlocale(locale.LC_ALL, "")
    # ensure current locale is in the list...
    # actually I'm not sure if this is a good idea. I Added this as my Raspberry Pi
    # defaults to en_GB and does not have en_US loaded, but it's probably better in
    # this case for the user to put en_GB in the config setting instead of en_US??
    # Or have an empty config setting?
    # Or at least remove en_US from the config list so we don't check the same names twice?
    lang = str(current_locale)
    _append_month_names(lang)
    logger.info(
        "Added month names for locale [%s], %s, %s ..."
        % (lang, MONTHNAMES[1][-2], MONTHNAMES[1][-1])
    )

    for lang in IMP_MONTHLANG.split(","):
        try:
            lang = str(lang).strip()
            if len(lang) > 1:
                locale.setlocale(locale.LC_ALL, lang)
                _append_month_names(lang)
                locale.setlocale(locale.LC_ALL, current_locale)  # restore entry state
                logger.info(
                    "Added month names for locale [%s], %s, %s ..."
                    % (lang, MONTHNAMES[1][-2], MONTHNAMES[1][-1])
                )
        except Exception:
            locale.setlocale(locale.LC_ALL, current_locale)  # restore entry state
            logger.warn("Unable to load requested locale [%s]" % lang)
            logger.info("Set locale back to entry state %s" % current_locale)

    # quick sanity check, warn if no english names in table
    if not any(name.startswith("en_") for name in MONTHNAMES[0]):
        logger.warn("No English language loaded - Magazine name matching will probably fail")

예제 #3

파일 보기

파일: postprocess.py 프로젝트: dmfinley/LazyLibrarian

def import_book(pp_path=None, bookID=None):
    """Import one downloaded book into the library folder structure.

    Separated into a function so books can be imported from an alternate
    directory given just the bookID, eg::

        if import_book(source_directory, bookID):
            ppcount = ppcount + 1

    pp_path: directory holding the downloaded files.
    bookID: BookID of the row to look up in the books table.

    Returns True when processDestination reports success.  Returns False when
    the bookID is not in the database or when processing fails; in the latter
    case a rename of pp_path to pp_path + '.fail' is attempted.
    """
    myDB = database.DBConnection()
    # NOTE(review): the query is built by string interpolation; if
    # myDB.select accepts bound parameters, prefer them here - confirm
    # against the database module.
    data = myDB.select('SELECT * from books WHERE BookID="%s"' % bookID)
    if not data:
        # always hand back a real boolean, as callers are told to expect
        return False

    authorname = data[0]['AuthorName']
    bookname = data[0]['BookName']

    if 'windows' in platform.system().lower() and '/' in lazylibrarian.EBOOK_DEST_FOLDER:
        logger.warn('Please check your EBOOK_DEST_FOLDER setting')
        # the corrected value is written back to the shared setting
        lazylibrarian.EBOOK_DEST_FOLDER = lazylibrarian.EBOOK_DEST_FOLDER.replace('/', '\\')

    dest_path = lazylibrarian.EBOOK_DEST_FOLDER.replace('$Author', authorname).replace('$Title', bookname)
    global_name = lazylibrarian.EBOOK_DEST_FILE.replace('$Author', authorname).replace('$Title', bookname)
    global_name = common.remove_accents(global_name)
    # Remove characters we don't want in the filename BEFORE adding to DESTINATION_DIR
    # as windows drive identifiers have colon, eg c:  but no colons allowed elsewhere?
    dic = {'<': '', '>': '', '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's',
           ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': ''}
    dest_path = formatter.latinToAscii(formatter.replace_all(dest_path, dic))
    dest_path = os.path.join(lazylibrarian.DESTINATION_DIR, dest_path).encode(lazylibrarian.SYS_ENCODING)

    processBook = processDestination(pp_path, dest_path, authorname, bookname, global_name)

    if processBook:
        # update nzbs
        controlValueDict = {"BookID": bookID}
        newValueDict = {"Status": "Processed", "NZBDate": formatter.now()}  # say when we processed it
        myDB.upsert("wanted", newValueDict, controlValueDict)
        processExtras(myDB, dest_path, global_name, data)
        logger.info('Successfully processed: %s' % global_name)
        notifiers.notify_download(formatter.latinToAscii(global_name) + ' at ' + formatter.now())
        return True

    logger.error('Postprocessing for %s has failed.' % global_name)
    logger.error('Warning - Residual files remain in %s.fail' % pp_path)
    try:
        os.rename(pp_path, pp_path + '.fail')
    except Exception:
        # best effort: covers OSError from the rename and TypeError when
        # pp_path was left at its None default
        logger.debug("Unable to rename %s" % pp_path)
    return False

예제 #4

파일 보기

def import_book(pp_path=None, bookID=None):
    """Import one downloaded book into the library folder structure.

    Separated into a function so books can be imported from an alternate
    directory given just the bookID, eg::

        if import_book(source_directory, bookID):
            ppcount = ppcount + 1

    pp_path: directory holding the downloaded files.
    bookID: BookID of the row to look up in the books table.

    Returns True when processDestination reports success.  Returns False when
    the bookID is not in the database or when processing fails; in the latter
    case a rename of pp_path to pp_path + '.fail' is attempted.
    """
    myDB = database.DBConnection()
    # NOTE(review): the query is built by string interpolation; if
    # myDB.select accepts bound parameters, prefer them here - confirm
    # against the database module.
    data = myDB.select('SELECT * from books WHERE BookID="%s"' % bookID)
    if not data:
        # always hand back a real boolean, as callers are told to expect
        return False

    authorname = data[0]['AuthorName']
    bookname = data[0]['BookName']

    if 'windows' in platform.system().lower() and '/' in lazylibrarian.EBOOK_DEST_FOLDER:
        logger.warn('Please check your EBOOK_DEST_FOLDER setting')
        # the corrected value is written back to the shared setting
        lazylibrarian.EBOOK_DEST_FOLDER = lazylibrarian.EBOOK_DEST_FOLDER.replace('/', '\\')

    dest_path = lazylibrarian.EBOOK_DEST_FOLDER.replace('$Author', authorname).replace('$Title', bookname)
    global_name = lazylibrarian.EBOOK_DEST_FILE.replace('$Author', authorname).replace('$Title', bookname)
    global_name = common.remove_accents(global_name)
    # Remove characters we don't want in the filename BEFORE adding to DESTINATION_DIR
    # as windows drive identifiers have colon, eg c:  but no colons allowed elsewhere?
    dic = {'<': '', '>': '', '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's',
           ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': ''}
    dest_path = formatter.latinToAscii(formatter.replace_all(dest_path, dic))
    dest_path = os.path.join(lazylibrarian.DESTINATION_DIR, dest_path).encode(lazylibrarian.SYS_ENCODING)

    processBook = processDestination(pp_path, dest_path, authorname, bookname, global_name)

    if processBook:
        # update nzbs
        controlValueDict = {"BookID": bookID}
        newValueDict = {"Status": "Processed", "NZBDate": formatter.now()}  # say when we processed it
        myDB.upsert("wanted", newValueDict, controlValueDict)
        processExtras(myDB, dest_path, global_name, data)
        logger.info('Successfully processed: %s' % global_name)
        notifiers.notify_download(formatter.latinToAscii(global_name) + ' at ' + formatter.now())
        return True

    logger.error('Postprocessing for %s has failed.' % global_name)
    logger.error('Warning - Residual files remain in %s.fail' % pp_path)
    try:
        os.rename(pp_path, pp_path + '.fail')
    except Exception:
        # best effort: covers OSError from the rename and TypeError when
        # pp_path was left at its None default
        logger.debug("Unable to rename %s" % pp_path)
    return False

예제 #5

파일 보기

파일: pushbullet.py 프로젝트: AndyHazz/LazyLibrarian

    def _notify(self, message=None, event=None, pushbullet_token=None, pushbullet_deviceid=None):
        """
        Sends a pushbullet notification based on the provided info or LL config

        message: The message string to send; accents are stripped from it
            with common.remove_accents first
        event: not read in the visible part of this method - presumably the
            title of the notification (TODO confirm)
        pushbullet_token: not read in the visible part of this method -
            presumably overrides the token from the config (TODO confirm)
        pushbullet_deviceid: not read in the visible part of this method -
            presumably overrides the device id from the config (TODO confirm)
        """
        try:
            message = common.remove_accents(message)
        except Exception as e:
            # conversion is best effort: keep the original message and warn
            logger.warn("Pushbullet: could not convert  message: %s" % e)

예제 #6

파일 보기

파일: pushbullet.py 프로젝트: timctrahan/LazyLibrarian

    def _notify(self,
                message=None,
                event=None,
                pushbullet_token=None,
                pushbullet_deviceid=None):
        """
        Sends a pushbullet notification based on the provided info or LL config

        message: The message string to send; accents are stripped from it
            with common.remove_accents first
        event: not read in the visible part of this method - presumably the
            title of the notification (TODO confirm)
        pushbullet_token: not read in the visible part of this method -
            presumably overrides the token from the config (TODO confirm)
        pushbullet_deviceid: not read in the visible part of this method -
            presumably overrides the device id from the config (TODO confirm)
        """
        try:
            message = common.remove_accents(message)
        except Exception as e:
            # conversion is best effort: keep the original message and warn
            logger.warn("Pushbullet: could not convert  message: %s" % e)

예제 #7

파일 보기

파일: librarysync.py 프로젝트: Trixter69/LazyLibrarian

def _build_book_pattern(dest_file, booktype_list):
    """Compile the regex that pulls author and title out of a book filename.

    dest_file is the naming template containing the $Author and $Title
    placeholders; booktype_list holds the accepted file extensions.  The
    template is escaped as a literal, so letters and punctuation in it cannot
    turn into regex syntax.  The extension is matched as an alternation
    anchored at the end of the name; a character class here would accept any
    single character drawn from the extension names.
    """
    template = re.escape(dest_file)
    template = template.replace(re.escape("$Author"), "(?P<author>.*?)")
    template = template.replace(re.escape("$Title"), "(?P<book>.*?)")
    extensions = "|".join(re.escape(book_type) for book_type in booktype_list)
    return re.compile(template + r"\.(?:" + extensions + ")$", re.VERBOSE)


def LibraryScan(dir=None):
    """Scan an ebook directory tree and reconcile it with the books database.

    dir: directory to scan; falls back to lazylibrarian.DOWNLOAD_DIR when
        empty.  Nothing is done if neither is set or the directory is missing.

    Steps, in order:
      * recreate the ``stats`` table;
      * when lazylibrarian.FULL_SCAN is set, mark "Open" books whose file is
        no longer on disk with lazylibrarian.NOTFOUND_STATUS;
      * walk the tree (skipping directories starting with "." or "_"), taking
        author/title/isbn/language from an .opf file, from epub/mobi
        metadata, or from the filename pattern, in that order of preference;
      * cache isbn-prefix to language mappings, add unknown authors via
        GoodReads when lazylibrarian.ADD_AUTHOR is set, and mark matching
        books as "Open" with their file location;
      * log statistics and refresh HaveBooks/UnignoredBooks for the authors
        that were touched.

    Returns None.

    NOTE(review): SQL statements are built with string interpolation from
    file metadata; values containing a double quote will break them.  Moving
    to bound parameters needs checking against the database module.
    """
    if not dir:
        if not lazylibrarian.DOWNLOAD_DIR:
            return
        dir = lazylibrarian.DOWNLOAD_DIR

    if not os.path.isdir(dir):
        logger.warn(
            'Cannot find directory: %s. Not scanning' %
            dir.decode(lazylibrarian.SYS_ENCODING, 'replace'))
        return

    myDB = database.DBConnection()

    myDB.action('drop table if exists stats')
    myDB.action(
        'create table stats ( authorname text, GR_book_hits int, GR_lang_hits int, LT_lang_hits int, GB_lang_change, cache_hits int, bad_lang int, bad_char int, uncached int )')

    new_authors = []

    logger.info(
        'Scanning ebook directory: %s' %
        dir.decode(lazylibrarian.SYS_ENCODING, 'replace'))

    new_book_count = 0
    file_count = 0

    if lazylibrarian.FULL_SCAN:
        books = myDB.select(
            'select AuthorName, BookName, BookFile, BookID from books where Status="Open"')
        status = lazylibrarian.NOTFOUND_STATUS
        logger.info('Missing books will be marked as %s' % status)
        for book in books:
            bookName = book['BookName']
            bookAuthor = book['AuthorName']
            bookID = book['BookID']
            bookfile = book['BookFile']

            if not (bookfile and os.path.isfile(bookfile)):
                myDB.action(
                    'update books set Status="%s" where BookID="%s"' %
                    (status, bookID))
                myDB.action(
                    'update books set BookFile="" where BookID="%s"' %
                    bookID)
                logger.warn(
                    'Book %s - %s updated as not found on disk' %
                    (bookAuthor, bookName))
                # the author's book counts need refreshing at the end
                if bookAuthor not in new_authors:
                    new_authors.append(bookAuthor)

    # guess this was meant to save repeat-scans of the same directory
    # if it contains multiple formats of the same book, but there was no code
    # that looked at the array. renamed from latest to processed to make
    # purpose clearer
    processed_subdirectories = []

    # massage the EBOOK_DEST_FILE config parameter into something we can use
    # with regular expression matching
    pattern = _build_book_pattern(
        lazylibrarian.EBOOK_DEST_FILE,
        formatter.getList(lazylibrarian.EBOOK_TYPE))

    for r, d, f in os.walk(dir):
        # prune in place so os.walk does not descend into hidden directories
        # or "_" directories (prevents magazines being scanned)
        d[:] = [directory for directory in d if not directory.startswith((".", "_"))]
        for files in f:
            file_count += 1
            subdirectory = r.replace(dir, '')
            # Added new code to skip if we've done this directory before. Made this conditional with a switch in config.ini
            # in case user keeps multiple different books in the same
            # subdirectory
            if (lazylibrarian.IMP_SINGLEBOOK) and (subdirectory in processed_subdirectories):
                logger.debug("[%s] already scanned" % subdirectory)
            else:
                # If this is a book, try to get author/title/isbn/language
                # If metadata.opf exists, use that
                # else if epub or mobi, read metadata from the book
                # else have to try pattern match for author/title and look up
                # isbn/lang from LT or GR later
                match = 0
                extn = ""
                # reset per file: a filename-pattern match supplies neither,
                # and values must not leak in from the previous file
                isbn = ""
                language = ""

                if '.' in files:
                    extn = files.split('.')[-1]

                if formatter.is_valid_booktype(files):
                    logger.debug(
                        "[%s] Now scanning subdirectory %s" %
                        (dir.decode(lazylibrarian.SYS_ENCODING, 'replace'), subdirectory.decode(lazylibrarian.SYS_ENCODING, 'replace')))

                    # calibre uses "metadata.opf", LL uses "bookname - authorname.opf"
                    # just look for any .opf file in the current directory since we don't know
                    # LL preferred authorname/bookname at this point
                    metafile = opf_file(r)
                    try:
                        res = get_book_info(metafile)
                    except Exception:
                        res = {}
                    if 'title' in res and 'creator' in res:  # this is the minimum we need
                        book = res['title']
                        author = res['creator']
                        if 'language' in res:
                            language = res['language']
                        else:
                            language = ""
                        if 'identifier' in res:
                            isbn = res['identifier']
                        else:
                            isbn = ""
                        match = 1
                        logger.debug(
                            "file meta [%s] [%s] [%s] [%s]" %
                            (isbn, language, author, book))
                    else:
                        logger.debug("File meta incomplete in %s" % metafile)

                    if not match:
                        # it's a book, but no external metadata found
                        # if it's an epub or a mobi we can try to read metadata
                        # from it
                        if (extn == "epub") or (extn == "mobi"):
                            book_filename = os.path.join(r, files).encode(lazylibrarian.SYS_ENCODING)
                            try:
                                res = get_book_info(book_filename)
                            except Exception:
                                res = {}
                            if 'title' in res and 'creator' in res:  # this is the minimum we need
                                book = res['title']
                                author = res['creator']
                                if 'language' in res:
                                    language = res['language']
                                else:
                                    language = ""
                                if 'identifier' in res:
                                    isbn = res['identifier']
                                else:
                                    isbn = ""
                                logger.debug("book meta [%s] [%s] [%s] [%s]" %
                                             (isbn, language, author, book))
                                match = 1
                            else:
                                logger.debug("Book meta incomplete in %s" % book_filename)

                if not match:
                    # last resort: derive author/title from the filename
                    match = pattern.match(files)
                    if match:
                        author = match.group("author")
                        book = match.group("book")
                    else:
                        logger.debug("Pattern match failed [%s]" % files)

                if match:
                    processed_subdirectories.append(
                        subdirectory)  # flag that we found a book in this subdirectory
                    #
                    # If we have a valid looking isbn, and language != "Unknown", add it to cache
                    #
                    if not language:
                        language = "Unknown"

                    if not formatter.is_valid_isbn(isbn):
                        isbn = ""
                    if isbn != "" and language != "Unknown":
                        logger.debug(
                            "Found Language [%s] ISBN [%s]" %
                            (language, isbn))
                        # we need to add it to language cache if not already
                        # there, is_valid_isbn has checked length is 10 or 13
                        if len(isbn) == 10:
                            isbnhead = isbn[0:3]
                        else:
                            isbnhead = isbn[3:6]
                        cached_lang = myDB.action(
                            'SELECT lang FROM languages where isbn = "%s"' %
                            (isbnhead)).fetchone()
                        if not cached_lang:
                            myDB.action(
                                'insert into languages values ("%s", "%s")' %
                                (isbnhead, language))
                            logger.debug(
                                "Cached Lang [%s] ISBN [%s]" %
                                (language, isbnhead))
                        else:
                            logger.debug(
                                "Already cached Lang [%s] ISBN [%s]" %
                                (language, isbnhead))

                    # get authors name in a consistent format
                    if "," in author:  # "surname, forename"
                        words = author.split(',')
                        author = words[1].strip() + ' ' + words[0].strip()  # "forename surname"
                    # a single leading initial ("J R R Tolkien") is normalised
                    # to dotted form; the length guard avoids an IndexError on
                    # names shorter than two characters
                    if len(author) > 1 and author[1] == ' ':
                        author = author.replace(' ', '.')
                        author = author.replace('..', '.')

                    # Check if the author exists, and import the author if not,
                    # before starting any complicated book-name matching to save repeating the search
                    #
                    check_exist_author = myDB.action(
                        'SELECT * FROM authors where AuthorName="%s"' %
                        author).fetchone()
                    if not check_exist_author and lazylibrarian.ADD_AUTHOR:
                        # no match for supplied author, but we're allowed to
                        # add new ones

                        GR = GoodReads(author)
                        try:
                            author_gr = GR.find_author_id()
                        except Exception:
                            logger.warn(
                                "Error finding author id for [%s]" %
                                author)
                            continue

                        # only try to add if GR data matches found author data
                        # not sure what this is for, never seems to fail??
                        if author_gr:
                            authorname = author_gr['authorname']

                            # "J.R.R. Tolkien" is the same person as "J. R. R. Tolkien" and "J R R Tolkien"
                            match_auth = author.replace('.', '_')
                            match_auth = match_auth.replace(' ', '_')
                            match_auth = match_auth.replace('__', '_')
                            match_name = authorname.replace('.', '_')
                            match_name = match_name.replace(' ', '_')
                            match_name = match_name.replace('__', '_')
                            match_name = common.remove_accents(match_name)
                            match_auth = common.remove_accents(match_auth)
                            # allow a degree of fuzziness to cater for different accented character handling.
                            # some author names have accents,
                            # filename may have the accented or un-accented version of the character
                            # The currently non-configurable value of fuzziness might need to go in config
                            # We stored GoodReads unmodified author name in
                            # author_gr, so store in LL db under that
                            match_fuzz = fuzz.ratio(match_auth, match_name)
                            if match_fuzz < 90:
                                logger.debug(
                                    "Failed to match author [%s] fuzz [%d]" %
                                    (author, match_fuzz))
                                logger.debug(
                                    "Failed to match author [%s] to authorname [%s]" %
                                    (match_auth, match_name))

                            # To save loading hundreds of books by unknown
                            # authors at GR or GB, ignore if author "Unknown"
                            if (author != "Unknown") and (match_fuzz >= 90):
                                # use "intact" name for author that we stored in
                                # GR author_dict, not one of the various mangled versions
                                # otherwise the books appear to be by a
                                # different author!
                                author = author_gr['authorname']
                                # this new authorname may already be in the
                                # database, so check again
                                check_exist_author = myDB.action(
                                    'SELECT * FROM authors where AuthorName="%s"' %
                                    author).fetchone()
                                if not check_exist_author:
                                    logger.debug(
                                        "Adding new author [%s]" %
                                        author)
                                    if author not in new_authors:
                                        new_authors.append(author)
                                    try:
                                        importer.addAuthorToDB(author)
                                        check_exist_author = myDB.action(
                                            'SELECT * FROM authors where AuthorName="%s"' %
                                            author).fetchone()
                                    except Exception:
                                        continue

                    # check author exists in db, either newly loaded or already
                    # there
                    if not check_exist_author:
                        logger.debug(
                            "Failed to match author [%s] in database" %
                            author)
                    else:
                        # author exists, check if this book by this author is in our database
                        # metadata might have quotes in book name
                        book = book.replace('"', '').replace("'", "")
                        bookid = find_book_in_db(myDB, author, book)
                        if bookid:
                            # check if book is already marked as "Open" (if so,
                            # we already had it)
                            check_status = myDB.action(
                                'SELECT Status from books where BookID="%s"' %
                                bookid).fetchone()
                            if check_status['Status'] != 'Open':
                                # update status as we've got this book
                                myDB.action(
                                    'UPDATE books set Status="Open" where BookID="%s"' %
                                    bookid)
                                book_filename = os.path.join(
                                    r,
                                    files).encode(
                                        lazylibrarian.SYS_ENCODING)
                                # update book location so we can check if it
                                # gets removed, or allow click-to-open
                                myDB.action(
                                    'UPDATE books set BookFile="%s" where BookID="%s"' %
                                    (book_filename, bookid))
                                new_book_count += 1

    cachesize = myDB.action("select count(*) from languages").fetchone()
    logger.info(
        "%s new/modified books found and added to the database" %
        new_book_count)
    logger.info("%s files processed" % file_count)
    if new_book_count:
        stats = myDB.action(
            "SELECT sum(GR_book_hits), sum(GR_lang_hits), sum(LT_lang_hits), sum(GB_lang_change), sum(cache_hits), sum(bad_lang), sum(bad_char), sum(uncached) FROM stats").fetchone()
        if lazylibrarian.BOOK_API == "GoogleBooks":
            logger.debug(
                "GoogleBooks was hit %s times for books" %
                stats['sum(GR_book_hits)'])
            logger.debug(
                "GoogleBooks language was changed %s times" %
                stats['sum(GB_lang_change)'])
        if lazylibrarian.BOOK_API == "GoodReads":
            logger.debug(
                "GoodReads was hit %s times for books" %
                stats['sum(GR_book_hits)'])
            logger.debug(
                "GoodReads was hit %s times for languages" %
                stats['sum(GR_lang_hits)'])
        logger.debug(
            "LibraryThing was hit %s times for languages" %
            stats['sum(LT_lang_hits)'])
        logger.debug(
            "Language cache was hit %s times" %
            stats['sum(cache_hits)'])
        logger.debug(
            "Unwanted language removed %s books" %
            stats['sum(bad_lang)'])
        logger.debug(
            "Unwanted characters removed %s books" %
            stats['sum(bad_char)'])
        logger.debug(
            "Unable to cache %s books with missing ISBN" %
            stats['sum(uncached)'])
    logger.debug("ISBN Language cache holds %s entries" % cachesize['count(*)'])
    stats = len(
        myDB.select('select BookID from Books where status="Open" and BookLang="Unknown"'))
    if stats:
        logger.warn(
            "There are %s books in your library with unknown language" %
            stats)

    # refresh the per-author book counts for every author touched above
    logger.debug('Updating %i authors' % len(new_authors))
    for auth in new_authors:
        havebooks = len(
            myDB.select('select BookName from Books where status="%s" and AuthorName="%s"' %
                        ('Open', auth)))
        myDB.action(
            'UPDATE authors set HaveBooks="%s" where AuthorName="%s"' %
            (havebooks, auth))
        totalbooks = len(
            myDB.select('select BookName from Books where status!="%s" and AuthorName="%s"' %
                        ('Ignored', auth)))
        myDB.action(
            'UPDATE authors set UnignoredBooks="%s" where AuthorName="%s"' %
            (totalbooks, auth))

    logger.info('Library scan complete')

예제 #8

파일 보기

파일: librarysync.py 프로젝트: Trixter69/LazyLibrarian

def find_book_in_db(myDB, author, book):
    """Fuzzy search the library for ``book`` by ``author``.

    An exact match on author name and book name is tried first.  If that
    fails, every book stored for the author is scored against ``book`` with
    ``fuzz.ratio`` and ``fuzz.partial_ratio`` (both names lower-cased and
    accent-stripped first).

    Args:
        myDB: database connection object providing ``action()`` and
            ``select()``.
        author: author name exactly as stored in the ``books`` table.
        book: book name to look for.

    Returns:
        The BookID of the matching book, or 0 if no candidate scores above
        the thresholds (ratio > 90, otherwise partial_ratio > 65).  The id is
        returned rather than a flag so the caller can update the book status.
    """
    # NOTE(review): the queries below are built by string interpolation, so an
    # author or book name containing a double quote will break them.  Switch
    # to bound parameters if the DB wrapper supports them - confirm its API.

    # prefer an exact match on author & book
    match = myDB.action(
        'SELECT BookID FROM books where AuthorName="%s" and BookName="%s"' %
        (author, book)).fetchone()
    if match:
        logger.debug('Exact match [%s]' % book)
        return match['BookID']

    # No exact match
    # Try a more complex fuzzy match against each book in the db by this author
    # Using hard-coded ratios for now, ratio high (>90), partial_ratio lower (>65)
    # These are results that work well on the original author's library:
    # minimal false matches and no misses on books that should be matched
    # Maybe make ratios configurable in config.ini later
    books = myDB.select(
        'SELECT BookID,BookName FROM books where AuthorName="%s"' %
        author)
    best_ratio = 0
    best_partial = 0
    ratio_name = ""
    partial_name = ""
    ratio_id = 0
    partial_id = 0
    # offset of the search term inside the current best partial match,
    # -1 when the best match does not contain the term verbatim
    partial_pos = -1
    logger.debug("Found %s books for %s" % (len(books), author))

    # tidy up everything to raise fuzziness scores; the search term does not
    # change between candidates so normalise it once
    book_lower = common.remove_accents(book.lower())

    for a_book in books:
        a_book_lower = common.remove_accents(a_book['BookName'].lower())
        ratio = fuzz.ratio(book_lower, a_book_lower)
        partial = fuzz.partial_ratio(book_lower, a_book_lower)
        position = a_book_lower.find(book_lower)

        if ratio > best_ratio:
            best_ratio = ratio
            ratio_name = a_book['BookName']
            ratio_id = a_book['BookID']

        if partial > best_partial:
            best_partial = partial
            partial_name = a_book['BookName']
            partial_id = a_book['BookID']
            partial_pos = position
        elif partial == best_partial and position >= 0 and (
                partial_pos < 0 or position < partial_pos):
            # prefer the match closest to the left, ie prefer starting with a match and ignoring the rest
            # this eliminates most false matches against omnibuses.
            # A candidate that does not contain the term at all (position -1)
            # must never displace one that does.
            logger.debug(
                "Fuzz left prefer [%s] over [%s]" %
                (a_book['BookName'], partial_name))
            partial_name = a_book['BookName']
            partial_id = a_book['BookID']
            partial_pos = position

    if best_ratio > 90:
        logger.debug(
            "Fuzz match   ratio [%d] [%s] [%s]" %
            (best_ratio, book, ratio_name))
        return ratio_id
    if best_partial > 65:
        logger.debug(
            "Fuzz match partial [%d] [%s] [%s]" %
            (best_partial, book, partial_name))
        return partial_id

    logger.debug(
        'Fuzz failed [%s - %s] ratio [%d,%s], partial [%d,%s]' %
        (author, book, best_ratio, ratio_name, best_partial, partial_name))
    return 0

예제 #9

파일 보기

파일: searchmag.py 프로젝트: andrzejc/LazyLibrarian

def search_magazines(mags=None, reset=False):
    """Search the enabled providers for new issues of active magazines.

    Builds a list of magazine titles to search for, queries the nzb and/or
    torrent providers for each, filters the results on title match, reject
    words and issue date, records wanted issues in the ``wanted`` table and
    hands them to the matching download method.

    Args:
        mags: ``None`` to search every magazine whose Status is "Active"
            (backlog search), otherwise an iterable of dicts whose
            ``'bookid'`` entry is the magazine title to search for.
        reset: when True, the ``search_magazines`` job is rescheduled once
            the search has finished.

    Returns:
        None.
    """
    # produce a list of magazines to search for, tor, nzb, torznab

    myDB = database.DBConnection()
    searchlist = []
    threading.currentThread().name = "SEARCHMAGS"

    # NOTE(review): the SQL in this function is built by string interpolation;
    # a magazine title containing a double quote will break the query.
    if mags is None:  # backlog search
        searchmags = myDB.select('SELECT Title, LastAcquired, \
                                 IssueDate from magazines WHERE Status="Active"')
    else:
        searchmags = []
        for magazine in mags:
            searchmags_temp = myDB.select('SELECT Title, LastAcquired, IssueDate from magazines \
                                          WHERE Title="%s" AND Status="Active"' % (magazine['bookid']))
            for terms in searchmags_temp:
                searchmags.append(terms)

    if len(searchmags) == 1:
        logger.info('Searching for one magazine')
    else:
        logger.info('Searching for %i magazines' % len(searchmags))

    # characters/sequences stripped or replaced in the title to build the search term
    dic = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': ''}

    for searchmag in searchmags:
        # the magazine title doubles as its id
        bookid = searchmag[0]
        searchterm = searchmag[0]
        # frequency = searchmag[1]
        # last_acquired = searchmag[2]
        # issue_date = searchmag[3]

        searchterm = formatter.latinToAscii(formatter.replace_all(searchterm, dic))
        searchterm = re.sub(r'[\.\-\/]', ' ', searchterm).encode('utf-8')
        searchlist.append({"bookid": bookid, "searchterm": searchterm})

    if not searchlist:
        logger.warn('There is nothing to search for.  Mark some magazines as active.')

    for book in searchlist:

        resultlist = []
        tor_resultlist = []
        if lazylibrarian.USE_NZB():
            resultlist, nproviders = providers.IterateOverNewzNabSites(book, 'mag')
            if not nproviders:
                logger.warn('No nzb providers are set. Check config for NEWZNAB or TORZNAB providers')

        if lazylibrarian.USE_TOR():
            tor_resultlist, nproviders = providers.IterateOverTorrentSites(book, 'mag')
            if not nproviders:
                logger.warn('No torrent providers are set. Check config for TORRENT providers')

            for item in tor_resultlist:  # reformat the torrent results so they look like nzbs
                resultlist.append({
                    'bookid': item['bookid'],
                    'nzbprov': item['tor_prov'],
                    'nzbtitle': item['tor_title'],
                    'nzburl': item['tor_url'],
                    'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100',  # fake date as none returned from torrents
                    'nzbsize': item['tor_size'],
                    'nzbmode': 'torrent'
                })

        if not resultlist:
            logger.debug("Adding magazine %s to queue." % book['searchterm'])

        else:
            # per-magazine counters for the summary log line
            bad_regex = 0
            bad_date = 0
            old_date = 0
            total_nzbs = 0
            new_date = 0
            to_snatch = 0
            maglist = []
            issues = []
            reject_list = formatter.getList(lazylibrarian.REJECT_WORDS)
            for nzb in resultlist:
                total_nzbs = total_nzbs + 1
                bookid = nzb['bookid']
                nzbtitle = (u'%s' % nzb['nzbtitle'])
                nzbtitle = nzbtitle.replace('"', '').replace("'", "")  # suppress " in titles
                nzburl = nzb['nzburl']
                nzbprov = nzb['nzbprov']
                nzbdate_temp = nzb['nzbdate']
                nzbsize_temp = nzb['nzbsize']
                if nzbsize_temp is None:  # not all torrents returned by torznab have a size
                    nzbsize_temp = 1000
                nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + ' MB'
                nzbdate = formatter.nzbdate2format(nzbdate_temp)
                nzbmode = nzb['nzbmode']
                checkifmag = myDB.select('SELECT * from magazines WHERE Title="%s"' % bookid)
                if checkifmag:
                    for results in checkifmag:
                        control_date = results['IssueDate']
                        # frequency = results['Frequency']
                        # regex = results['Regex']

                    nzbtitle_formatted = nzbtitle.replace('.', ' ').replace('-', ' ').replace('/', ' ').replace(
                        '+', ' ').replace('_', ' ').replace('(', '').replace(')', '').strip()
                    # Need to make sure that substrings of magazine titles don't get found
                    # (e.g. Maxim USA will find Maximum PC USA) - token_set_ratio takes care of this
                    # keyword_check = nzbtitle_formatted.replace(bookid, '')
                    # remove extra spaces if they're in a row
                    nzbtitle_exploded_temp = " ".join(nzbtitle_formatted.split())
                    nzbtitle_exploded = nzbtitle_exploded_temp.split(' ')

                    # split() without a matching separator already yields [bookid]
                    bookid_exploded = bookid.split(' ')

                    # check nzb starts with magazine title, and ends with a date
                    # eg The MagPI Issue 22 - July 2015
                    # do something like check left n words match title
                    # then check last n words are a date

                    name_match = 1  # assume name matches for now
                    if len(nzbtitle_exploded) > len(bookid_exploded):  # needs to be longer as it has to include a date
                        # check (nearly) all the words in the mag title are in the nzbtitle - allow some fuzz
                        mag_title_match = fuzz.token_set_ratio(common.remove_accents(bookid), common.remove_accents(nzbtitle_formatted))
                        if mag_title_match < lazylibrarian.MATCH_RATIO:
                            logger.debug(u"Magazine token set Match failed: " + str(mag_title_match) + "% for " + nzbtitle_formatted)
                            name_match = 0

                    lower_title = common.remove_accents(nzbtitle_formatted).lower()
                    lower_bookid = common.remove_accents(bookid).lower()
                    for word in reject_list:
                        # a reject word only counts if it is not part of the magazine's own title
                        if word in lower_title and word not in lower_bookid:
                            name_match = 0
                            logger.debug("Rejecting %s, contains %s" % (nzbtitle_formatted, word))
                            break

                    if name_match:
                        # some magazine torrent uploaders add their sig in [] or {}
                        # Fortunately for us, they always seem to add it at the end
                        # also some magazine torrent titles are "magazine_name some_form_of_date pdf"
                        # so strip all the trailing junk...
                        # Guard against running out of words (title made up of
                        # nothing but junk) and against an empty last word.
                        while nzbtitle_exploded and (
                                nzbtitle_exploded[-1].startswith(('[', '{')) or
                                nzbtitle_exploded[-1].lower() == 'pdf'):
                            nzbtitle_exploded.pop()  # gotta love the function names

                        # need at least one word magazine title and two date components
                        if len(nzbtitle_exploded) > 2:
                            # regexA = DD MonthName YYYY OR MonthName YYYY or Issue nn MonthName YYYY
                            regexA_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1]
                            regexA_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 2]
                            regexA_month = formatter.month2num(common.remove_accents(regexA_month_temp))
                            if not regexA_year.isdigit() or int(regexA_year) < 1900 or int(regexA_year) > 2100:
                                regexA_year = 'fail'  # force date failure

                            # if frequency == "Weekly" or frequency == "BiWeekly":
                            regexA_day = nzbtitle_exploded[len(nzbtitle_exploded) - 3].zfill(2)
                            if regexA_day.isdigit():
                                if int(regexA_day) > 31:  # probably issue number nn
                                    regexA_day = '01'
                            else:
                                regexA_day = '01'  # just MonthName YYYY
                            # else:
                            #    regexA_day = '01'  # monthly, or less frequent

                            try:
                                newdatish = regexA_year + '-' + regexA_month + '-' + regexA_day
                                # try to make sure the year/month/day are valid, exception if not
                                # ie don't accept day > 31, or 30 in some months
                                # also handles multiple date format named issues eg Jan 2014, 01 2014
                                # datetime will give a ValueError if not a good date or a param is not int
                                # (the date object is built purely for validation)
                                datetime.date(int(regexA_year), int(regexA_month), int(regexA_day))
                            except ValueError:
                                # regexB = MonthName DD YYYY
                                regexB_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1]
                                regexB_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 3]
                                regexB_month = formatter.month2num(common.remove_accents(regexB_month_temp))
                                regexB_day = nzbtitle_exploded[len(nzbtitle_exploded) - 2].zfill(2)
                                if not regexB_year.isdigit() or int(regexB_year) < 1900 or int(regexB_year) > 2100:
                                    regexB_year = 'fail'

                                try:
                                    newdatish = regexB_year + '-' + regexB_month + '-' + regexB_day
                                    # datetime will give a ValueError if not a good date or a param is not int
                                    datetime.date(int(regexB_year), int(regexB_month), int(regexB_day))
                                except ValueError:
                                    # regexC = YYYY MM or YYYY MM DD
                                    # (can't get MM/DD if named YYYY Issue nn)
                                    # First try  YYYY MM
                                    regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 2]
                                    if regexC_year.isdigit() and int(regexC_year) > 1900 and int(regexC_year) < 2100:
                                        regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2)
                                        regexC_day = '01'
                                    else:  # try YYYY MM DD
                                        regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 3]
                                        if regexC_year.isdigit() and int(regexC_year) > 1900 and int(regexC_year) < 2100:
                                            regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 2].zfill(2)
                                            regexC_day = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2)
                                        else:
                                            # force date failure; set every component so nothing
                                            # is left unbound or stale from an earlier result
                                            regexC_year = 'fail'
                                            regexC_month = 'fail'
                                            regexC_day = 'fail'
                                    try:
                                        newdatish = regexC_year + '-' + regexC_month + '-' + regexC_day
                                        # datetime will give a ValueError if not a good date or a param is not int
                                        datetime.date(int(regexC_year), int(regexC_month), int(regexC_day))
                                    except ValueError:
                                        logger.debug('Magazine %s not in proper date format.' % nzbtitle_formatted)
                                        bad_date = bad_date + 1
                                        # allow issues with good name but bad date to be included
                                        # so user can manually select them, incl those with issue numbers
                                        newdatish = "1970-01-01"  # provide a fake date for bad-date issues
                                        # continue
                        else:
                            continue

                        if control_date is None:  # we haven't got any copies of this magazine yet
                            # get a rough time just over a month ago to compare to, in format yyyy-mm-dd
                            # could perhaps calc differently for weekly, biweekly etc
                            start_time = time.time()
                            start_time -= 31 * 24 * 60 * 60  # number of seconds in 31 days
                            control_date = time.strftime("%Y-%m-%d", time.localtime(start_time))

                        # only grab a copy if it's newer than the most recent we have,
                        # or newer than a month ago if we have none
                        comp_date = formatter.datecompare(newdatish, control_date)
                        if comp_date > 0:
                            # Should probably only upsert when downloaded and processed in case snatch fails
                            # keep track of what we're going to download so we don't download dupes
                            new_date = new_date + 1
                            issue = bookid + ',' + newdatish
                            if issue not in issues:
                                maglist.append({
                                    'bookid': bookid,
                                    'nzbprov': nzbprov,
                                    'nzbtitle': nzbtitle,
                                    'nzburl': nzburl,
                                    'nzbmode': nzbmode
                                })
                                logger.debug('This issue of %s is new, downloading' % nzbtitle_formatted)
                                to_snatch = to_snatch + 1
                                issues.append(issue)

                                controlValueDict = {"NZBurl": nzburl}
                                newValueDict = {
                                    "NZBprov": nzbprov,
                                    "BookID": bookid,
                                    "NZBdate": formatter.now(),  # when we asked for it
                                    "NZBtitle": nzbtitle,
                                    "AuxInfo": newdatish,
                                    "Status": "Wanted",
                                    "NZBsize": nzbsize,
                                    "NZBmode": nzbmode
                                }
                                myDB.upsert("wanted", newValueDict, controlValueDict)

                            else:
                                logger.debug('This issue of %s is already flagged for download' % issue)
                        else:
                            if newdatish != "1970-01-01":  # this is our fake date for ones we can't decipher
                                logger.debug('This issue of %s is old; skipping.' % nzbtitle_formatted)
                                old_date = old_date + 1
                    else:
                        logger.debug('Magazine [%s] does not completely match search term [%s].' % (
                                     nzbtitle_formatted, bookid))
                        bad_regex = bad_regex + 1

            logger.info('Found %s results for %s.  %s are new, %s are old, %s fail date, %s fail name matching' % (
                        total_nzbs, bookid, new_date, old_date, bad_date, bad_regex))
            logger.info("%s, %s issues to download" % (bookid, to_snatch))

            for items in maglist:
                # torznab and plain torrent results both go through the torrent downloader
                if items['nzbmode'] in ("torznab", "torrent"):
                    snatch = TORDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl'])
                else:
                    snatch = NZBDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl'])
                if snatch:
                    notifiers.notify_snatch(formatter.latinToAscii(items['nzbtitle']) + ' at ' + formatter.now())
                    common.schedule_job(action='Start', target='processDir')

    if reset == True:
        common.schedule_job(action='Restart', target='search_magazines')

    logger.info("Search for magazines complete")

예제 #10

파일 보기

파일: librarysync.py 프로젝트: NVRemoteDev/LazyLibrarian

def LibraryScan(dir=None):
    if not dir:
        if not lazylibrarian.DOWNLOAD_DIR:
            return
        else:
            dir = lazylibrarian.DOWNLOAD_DIR

    if not os.path.isdir(dir):
        logger.warn(
            'Cannot find directory: %s. Not scanning' %
            dir.decode(lazylibrarian.SYS_ENCODING, 'replace'))
        return

    myDB = database.DBConnection()

    myDB.action('drop table if exists stats')
    myDB.action(
        'create table stats (authorname text, GR_book_hits int, GR_lang_hits int, LT_lang_hits int, \
                            GB_lang_change, cache_hits int, bad_lang int, bad_char int, uncached int )')

    logger.info(
        'Scanning ebook directory: %s' %
        dir.decode(lazylibrarian.SYS_ENCODING, 'replace'))

    new_book_count = 0
    file_count = 0

    if lazylibrarian.FULL_SCAN:
        books = myDB.select(
            'select AuthorName, BookName, BookFile, BookID from books where Status="Open"')
        status = lazylibrarian.NOTFOUND_STATUS
        logger.info('Missing books will be marked as %s' % status)
        for book in books:
            bookName = book['BookName']
            bookAuthor = book['AuthorName']
            bookID = book['BookID']
            bookfile = book['BookFile']

            if not(bookfile and os.path.isfile(bookfile)):
                myDB.action('update books set Status="%s" where BookID="%s"' % (status, bookID))
                myDB.action('update books set BookFile="" where BookID="%s"' % bookID)
                logger.warn('Book %s - %s updated as not found on disk' % (bookAuthor, bookName))
        
    # to save repeat-scans of the same directory if it contains multiple formats of the same book, 
    # keep track of which directories we've already looked at 
    processed_subdirectories = []

    matchString = ''
    for char in lazylibrarian.EBOOK_DEST_FILE:
        matchString = matchString + '\\' + char
    # massage the EBOOK_DEST_FILE config parameter into something we can use
    # with regular expression matching
    booktypes = ''
    count = -1
    booktype_list = formatter.getList(lazylibrarian.EBOOK_TYPE)
    for book_type in booktype_list:
        count += 1
        if count == 0:
            booktypes = book_type
        else:
            booktypes = booktypes + '|' + book_type
    matchString = matchString.replace("\\$\\A\\u\\t\\h\\o\\r", "(?P<author>.*?)").replace(
        "\\$\\T\\i\\t\\l\\e", "(?P<book>.*?)") + '\.[' + booktypes + ']'
    pattern = re.compile(matchString, re.VERBOSE)

    for r, d, f in os.walk(dir):
        for directory in d[:]:
            if directory.startswith("."):
                d.remove(directory)
            # prevent magazine being scanned
            if directory.startswith("_"):
                d.remove(directory)

        for files in f:
            file_count += 1

            if isinstance(r, str):
                r = r.decode('utf-8')

            subdirectory = r.replace(dir, '')
            # Added new code to skip if we've done this directory before.
            # Made this conditional with a switch in config.ini
            # in case user keeps multiple different books in the same subdirectory
            if (lazylibrarian.IMP_SINGLEBOOK) and (subdirectory in processed_subdirectories):
                logger.debug("[%s] already scanned" % subdirectory)
            else:
                # If this is a book, try to get author/title/isbn/language
                # if epub or mobi, read metadata from the book
                # If metadata.opf exists, use that allowing it to override
                # embedded metadata. User may have edited metadata.opf
                # to merge author aliases together
                # If all else fails, try pattern match for author/title
                # and look up isbn/lang from LT or GR later
                match = 0
                if formatter.is_valid_booktype(files):

                    logger.debug("[%s] Now scanning subdirectory %s" %
                                 (dir, subdirectory))

                    language = "Unknown"
                    isbn = ""
                    book = ""
                    author = ""
                    words = files.split('.')
                    extn = words[len(words) - 1]

                    # if it's an epub or a mobi we can try to read metadata from it
                    if (extn == "epub") or (extn == "mobi"):
                        book_filename = os.path.join(
                            r.encode(lazylibrarian.SYS_ENCODING), files.encode(lazylibrarian.SYS_ENCODING))

                        try:
                            res = get_book_info(book_filename)
                        except:
                            res = {}
                        if 'title' in res and 'creator' in res:  # this is the minimum we need
                            match = 1
                            book = res['title']
                            author = res['creator']
                            if 'language' in res:
                                language = res['language']
                            if 'identifier' in res:
                                isbn = res['identifier']
                            if 'type' in res:
                                extn = res['type']

                            logger.debug("book meta [%s] [%s] [%s] [%s] [%s]" %
                                         (isbn, language, author, book, extn))
                        else:

                            logger.debug("Book meta incomplete in %s" % book_filename)

                    # calibre uses "metadata.opf", LL uses "bookname - authorname.opf"
                    # just look for any .opf file in the current directory since we don't know
                    # LL preferred authorname/bookname at this point.
                    # Allow metadata in file to override book contents as may be users pref

                    metafile = opf_file(r)
                    try:
                        res = get_book_info(metafile)
                    except:
                        res = {}
                    if 'title' in res and 'creator' in res:  # this is the minimum we need
                        match = 1
                        book = res['title']
                        author = res['creator']
                        if 'language' in res:
                            language = res['language']
                        if 'identifier' in res:
                            isbn = res['identifier']
                        logger.debug(
                            "file meta [%s] [%s] [%s] [%s]" %
                            (isbn, language, author, book))
                    else:
                        logger.debug("File meta incomplete in %s" % metafile)

                    if not match:  # no author/book from metadata file, and not embedded either
                        match = pattern.match(files)
                        if match:
                            author = match.group("author")
                            book = match.group("book")
                        else:
                            logger.debug("Pattern match failed [%s]" % files)

                    if match:
                        # flag that we found a book in this subdirectory
                        processed_subdirectories.append(subdirectory)

                        # If we have a valid looking isbn, and language != "Unknown", add it to cache
                        if language != "Unknown" and formatter.is_valid_isbn(isbn):
                            logger.debug(
                                "Found Language [%s] ISBN [%s]" %
                                (language, isbn))
                            # we need to add it to language cache if not already
                            # there, is_valid_isbn has checked length is 10 or 13
                            if len(isbn) == 10:
                                isbnhead = isbn[0:3]
                            else:
                                isbnhead = isbn[3:6]
                            match = myDB.action(
                                'SELECT lang FROM languages where isbn = "%s"' %
                                (isbnhead)).fetchone()
                            if not match:
                                myDB.action(
                                    'insert into languages values ("%s", "%s")' %
                                    (isbnhead, language))
                                logger.debug(
                                    "Cached Lang [%s] ISBN [%s]" %
                                    (language, isbnhead))
                            else:
                                logger.debug(
                                    "Already cached Lang [%s] ISBN [%s]" %
                                    (language, isbnhead))

                        # get authors name in a consistent format
                        if "," in author:  # "surname, forename"
                            words = author.split(',')
                            author = words[1].strip() + ' ' + words[0].strip()  # "forename surname"
                        if author[1] == ' ':
                            author = author.replace(' ', '.')
                            author = author.replace('..', '.')

                        # Check if the author exists, and import the author if not,
                        # before starting any complicated book-name matching to save repeating the search
                        #
                        check_exist_author = myDB.action(
                            'SELECT * FROM authors where AuthorName="%s"' %
                            author).fetchone()
                        if not check_exist_author and lazylibrarian.ADD_AUTHOR:
                            # no match for supplied author, but we're allowed to
                            # add new ones

                            GR = GoodReads(author)
                            try:
                                author_gr = GR.find_author_id()
                            except:
                                logger.warn(
                                    "Error finding author id for [%s]" %
                                    author)
                                continue

                            # only try to add if GR data matches found author data
                            if author_gr:
                                authorname = author_gr['authorname']

                                # "J.R.R. Tolkien" is the same person as "J. R. R. Tolkien" and "J R R Tolkien"
                                match_auth = author.replace('.', '_')
                                match_auth = match_auth.replace(' ', '_')
                                match_auth = match_auth.replace('__', '_')
                                match_name = authorname.replace('.', '_')
                                match_name = match_name.replace(' ', '_')
                                match_name = match_name.replace('__', '_')
                                match_name = common.remove_accents(match_name)
                                match_auth = common.remove_accents(match_auth)
                                # allow a degree of fuzziness to cater for different accented character handling.
                                # some author names have accents,
                                # filename may have the accented or un-accented version of the character
                                # The currently non-configurable value of fuzziness might need to go in config
                                # We stored GoodReads unmodified author name in
                                # author_gr, so store in LL db under that
                                # fuzz.ratio doesn't lowercase for us
                                match_fuzz = fuzz.ratio(match_auth.lower(), match_name.lower())
                                if match_fuzz < 90:
                                    logger.debug(
                                        "Failed to match author [%s] fuzz [%d]" %
                                        (author, match_fuzz))
                                    logger.debug(
                                        "Failed to match author [%s] to authorname [%s]" %
                                        (match_auth, match_name))

                                # To save loading hundreds of books by unknown
                                # authors at GR or GB, ignore if author "Unknown"
                                if (author != "Unknown") and (match_fuzz >= 90):
                                    # use "intact" name for author that we stored in
                                    # GR author_dict, not one of the various mangled versions
                                    # otherwise the books appear to be by a different author!
                                    author = author_gr['authorname']
                                    # this new authorname may already be in the
                                    # database, so check again
                                    check_exist_author = myDB.action(
                                        'SELECT * FROM authors where AuthorName="%s"' %
                                        author).fetchone()
                                    if not check_exist_author:
                                        logger.debug(
                                            "Adding new author [%s]" %
                                            author)
                                        try:
                                            importer.addAuthorToDB(author)
                                            check_exist_author = myDB.action(
                                                'SELECT * FROM authors where AuthorName="%s"' %
                                                author).fetchone()
                                        except:
                                            continue

                        # check author exists in db, either newly loaded or already there
                        if not check_exist_author:
                            logger.debug(
                                "Failed to match author [%s] in database" %
                                author)
                        else:
                            # author exists, check if this book by this author is in our database
                            # metadata might have quotes in book name
                            book = book.replace('"', '').replace("'", "")
                            bookid = find_book_in_db(myDB, author, book)

                            if bookid:
                                # check if book is already marked as "Open" (if so,
                                # we already had it)

                                check_status = myDB.action(
                                    'SELECT Status from books where BookID="%s"' %
                                    bookid).fetchone()
                                if check_status['Status'] != 'Open':
                                    # update status as we've got this book

                                    myDB.action(
                                        'UPDATE books set Status="Open" where BookID="%s"' %
                                        bookid)

                                    book_filename = os.path.join(r, files)

                                    # update book location so we can check if it
                                    # gets removed, or allow click-to-open

                                    myDB.action(
                                        'UPDATE books set BookFile="%s" where BookID="%s"' %
                                        (book_filename, bookid))

                                    new_book_count += 1

    cachesize = myDB.action("select count('ISBN') as counter from languages").fetchone()
    logger.info(
        "%s new/modified books found and added to the database" %
        new_book_count)
    logger.info("%s files processed" % file_count)
    stats = myDB.action(
        "SELECT sum(GR_book_hits), sum(GR_lang_hits), sum(LT_lang_hits), sum(GB_lang_change), \
            sum(cache_hits), sum(bad_lang), sum(bad_char), sum(uncached) FROM stats").fetchone()
    if stats['sum(GR_book_hits)'] is not None:
        # only show stats if new books added
        if lazylibrarian.BOOK_API == "GoogleBooks":
            logger.debug(
                "GoogleBooks was hit %s times for books" %
                stats['sum(GR_book_hits)'])
            logger.debug(
                "GoogleBooks language was changed %s times" %
                stats['sum(GB_lang_change)'])
        if lazylibrarian.BOOK_API == "GoodReads":
            logger.debug(
                "GoodReads was hit %s times for books" %
                stats['sum(GR_book_hits)'])
            logger.debug(
                "GoodReads was hit %s times for languages" %
                stats['sum(GR_lang_hits)'])
        logger.debug(
            "LibraryThing was hit %s times for languages" %
            stats['sum(LT_lang_hits)'])
        logger.debug(
            "Language cache was hit %s times" %
            stats['sum(cache_hits)'])
        logger.debug(
            "Unwanted language removed %s books" %
            stats['sum(bad_lang)'])
        logger.debug(
            "Unwanted characters removed %s books" %
            stats['sum(bad_char)'])
        logger.debug(
            "Unable to cache %s books with missing ISBN" %
            stats['sum(uncached)'])
    logger.debug("Cache %s hits, %s miss" % (lazylibrarian.CACHE_HIT, lazylibrarian.CACHE_MISS))
    logger.debug("ISBN Language cache holds %s entries" % cachesize['counter'])
    stats = len(myDB.select('select BookID from Books where status="Open" and BookLang="Unknown"'))
    if stats:
        logger.warn("There are %s books in your library with unknown language" % stats)

    authors = myDB.select('select AuthorName from authors')
    # Update bookcounts for all authors, not just new ones - refresh may have located
    # new books for existing authors especially if switched provider gb/gr
    logger.debug('Updating bookcounts for %i authors' % len(authors))
    for author in authors:
        name = author['AuthorName']
        havebooks = myDB.action(
            'SELECT count("BookID") as counter from books WHERE AuthorName="%s" AND (Status="Have" OR Status="Open")' %
            name).fetchone()
        myDB.action('UPDATE authors set HaveBooks="%s" where AuthorName="%s"' % (havebooks['counter'], name))
        totalbooks = myDB.action(
            'SELECT count("BookID") as counter FROM books WHERE AuthorName="%s"' % name).fetchone()        
        myDB.action('UPDATE authors set TotalBooks="%s" where AuthorName="%s"' % (totalbooks['counter'], name))
        unignoredbooks = myDB.action(
            'SELECT count("BookID") as counter FROM books WHERE AuthorName="%s" AND Status!="Ignored"' %
            name).fetchone()
        myDB.action('UPDATE authors set UnignoredBooks="%s" where AuthorName="%s"' % (unignoredbooks['counter'], name))

    covers = myDB.action("select  count('bookimg') as counter from books where bookimg like 'http%'").fetchone()
    logger.info("Caching covers for %s books" % covers['counter'])

    images = myDB.action('select bookid, bookimg, bookname from books where bookimg like "http%"')
    for item in images:
        bookid = item['bookid']
        bookimg = item['bookimg']
        bookname = item['bookname']
        newimg = bookwork.cache_cover(bookid, bookimg)
        if newimg != bookimg:
            myDB.action('update books set BookImg="%s" where BookID="%s"' % (newimg, bookid))
    logger.info('Library scan complete')

예제 #11

파일 보기

파일: librarysync.py 프로젝트: NVRemoteDev/LazyLibrarian

def find_book_in_db(myDB, author, book):
    """Look up a book by `author` in the library, allowing a fuzzy title match.

    An exact AuthorName/BookName match is preferred. Failing that, every
    book stored for the author is compared with `book` (both lowercased and
    with accents removed) using fuzz.ratio and fuzz.partial_ratio. The best
    ratio is accepted if it is above 90, otherwise the best partial ratio is
    accepted if it is above 65. Both thresholds are hard-coded for now and
    could be made configurable later.

    :param myDB: database connection providing action() and select()
    :param author: author name as stored in the books table
    :param book: book title to search for
    :return: the BookID of the match, or 0 if nothing matched well enough
    """
    # NOTE(review): the queries below are built by string formatting, so a
    # double quote inside author/book would break them. Callers visible in
    # this file strip quotes from the title first; switching to bound
    # parameters needs checking against how DBConnection handles byte strings.
    match = myDB.action(
        'SELECT BookID FROM books where AuthorName="%s" and BookName="%s"' %
        (author, book)).fetchone()
    if match:
        logger.debug('Exact match [%s]' % book)
        return match['BookID']

    # No exact match: fuzzy match against each book in the db by this author
    books = myDB.select('SELECT BookID,BookName FROM books where AuthorName="%s"' % author)

    # the wanted title never changes, so normalise it once, not once per row
    book_lower = common.remove_accents(book.lower())

    not_found = float('inf')  # sorts after every real string position
    best_ratio = 0
    best_partial = 0
    ratio_name = ""
    partial_name = ""
    ratio_id = 0
    partial_id = 0
    partial_pos = not_found

    for a_book in books:
        # tidy up everything to raise fuzziness scores
        a_book_lower = common.remove_accents(a_book['BookName'].lower())
        ratio = fuzz.ratio(book_lower, a_book_lower)
        partial = fuzz.partial_ratio(book_lower, a_book_lower)

        # Where the wanted title starts inside this candidate. str.find gives
        # -1 when it is absent, which must rank last rather than "left-most".
        # The position is taken from the accent-stripped candidate so it is
        # compared like-for-like with the accent-stripped wanted title.
        pos = a_book_lower.find(book_lower)
        if pos < 0:
            pos = not_found

        if ratio > best_ratio:
            best_ratio = ratio
            ratio_name = a_book['BookName']
            ratio_id = a_book['BookID']

        if partial > best_partial:
            best_partial = partial
            partial_name = a_book['BookName']
            partial_id = a_book['BookID']
            partial_pos = pos
        elif partial == best_partial and pos < partial_pos:
            # prefer the match closest to the left, ie prefer starting with a
            # match and ignoring the rest; this eliminates most false matches
            # against omnibuses
            logger.debug(
                "Fuzz left prefer [%s] over [%s]" %
                (a_book['BookName'], partial_name))
            partial_name = a_book['BookName']
            partial_id = a_book['BookID']
            partial_pos = pos

    if best_ratio > 90:
        logger.debug(
            "Fuzz match   ratio [%d] [%s] [%s]" %
            (best_ratio, book, ratio_name))
        return ratio_id
    if best_partial > 65:
        logger.debug(
            "Fuzz match partial [%d] [%s] [%s]" %
            (best_partial, book, partial_name))
        return partial_id

    logger.debug(
        'Fuzz failed [%s - %s] ratio [%d,%s], partial [%d,%s]' %
        (author, book, best_ratio, ratio_name, best_partial, partial_name))
    return 0

예제 #12

파일 보기

def search_magazines(mags=None, reset=False):
    """Search the configured providers for new issues of active magazines.

    A search term is built for each magazine, the nzb and/or torrent
    providers are queried, and each result is checked against the magazine
    title, the reject-word list and a date parsed from the end of the result
    title. New matching results are stored in the "wanted" table as
    "Skipped"; those whose date compares as newer than the magazine's
    IssueDate (or than roughly a month ago when IssueDate is None) are marked
    "Wanted" and handed to the matching download method.

    :param mags: None to search every magazine with Status "Active", else an
                 iterable of dicts whose 'bookid' entry is the magazine Title
    :param reset: when true, restart the 'search_magazines' scheduled job
                  once the search has finished
    :return: None
    """
    # produce a list of magazines to search for, tor, nzb, torznab

    myDB = database.DBConnection()
    searchlist = []
    threading.currentThread().name = "SEARCHMAGS"

    if mags is None:  # backlog search
        searchmags = myDB.select('SELECT Title, LastAcquired, \
                                 IssueDate from magazines WHERE Status="Active"')
    else:
        searchmags = []
        for magazine in mags:
            searchmags_temp = myDB.select('SELECT Title, LastAcquired, IssueDate from magazines \
                                          WHERE Title="%s" AND Status="Active"' % (magazine['bookid']))
            for terms in searchmags_temp:
                searchmags.append(terms)

    if len(searchmags) == 1:
        logger.info('Searching for one magazine')
    else:
        logger.info('Searching for %i magazines' % len(searchmags))

    for searchmag in searchmags:
        # column 0 is Title: it serves both as the magazine's id and as the
        # starting point for the search term
        bookid = searchmag[0]
        searchterm = searchmag[0]
        # frequency = searchmag[1]
        # last_acquired = searchmag[2]
        # issue_date = searchmag[3]

        # substrings to substitute in the title before it is used as a search term
        dic = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': ''}

        searchterm = formatter.latinToAscii(formatter.replace_all(searchterm, dic))
        # dots, dashes and slashes become spaces
        searchterm = re.sub('[\.\-\/]', ' ', searchterm).encode('utf-8')
        searchlist.append({"bookid": bookid, "searchterm": searchterm})

    if searchlist == []:
        logger.warn('There is nothing to search for.  Mark some magazines as active.')

    for book in searchlist:

        resultlist = []
        tor_resultlist = []
        if lazylibrarian.USE_NZB():
            resultlist, nproviders = providers.IterateOverNewzNabSites(book, 'mag')
            if not nproviders:
                logger.warn('No nzb providers are set. Check config for NEWZNAB or TORZNAB providers')

        if lazylibrarian.USE_TOR():
            tor_resultlist, nproviders = providers.IterateOverTorrentSites(book, 'mag')
            if not nproviders:
                logger.warn('No torrent providers are set. Check config for TORRENT providers')

            for item in tor_resultlist:  # reformat the torrent results so they look like nzbs
                resultlist.append({
                    'bookid': item['bookid'],
                    'nzbprov': item['tor_prov'],
                    'nzbtitle': item['tor_title'],
                    'nzburl': item['tor_url'],
                    'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100',  # fake date as none returned from torrents
                    'nzbsize': item['tor_size'],
                    'nzbmode': 'torrent'
                })

        if not resultlist:
            # no provider returned anything for this magazine; only a debug line is written
            logger.debug("Adding magazine %s to queue." % book['searchterm'])

        else:
            # per-magazine tallies reported in the summary log line below
            bad_regex = 0
            bad_date = 0
            old_date = 0
            total_nzbs = 0
            new_date = 0
            to_snatch = 0
            maglist = []  # results chosen for download
            issues = []  # "title,date" keys already chosen, to avoid duplicate downloads
            reject_list = formatter.getList(lazylibrarian.REJECT_WORDS)
            for nzb in resultlist:
                total_nzbs = total_nzbs + 1
                bookid = nzb['bookid']
                nzbtitle = (u'%s' % nzb['nzbtitle'])
                nzbtitle = nzbtitle.replace('"', '').replace("'", "")  # suppress " in titles
                nzburl = nzb['nzburl']
                nzbprov = nzb['nzbprov']
                nzbdate_temp = nzb['nzbdate']
                nzbsize_temp = nzb['nzbsize']
                if nzbsize_temp is None:  # not all torrents returned by torznab have a size
                    nzbsize_temp = 1000
                # 1048576 = 1024 * 1024, so the stored size is expressed in MB
                nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + ' MB'
                nzbdate = formatter.nzbdate2format(nzbdate_temp)
                nzbmode = nzb['nzbmode']

                checkifmag = myDB.select('SELECT * from magazines WHERE Title="%s"' % bookid)
                if checkifmag:
                    # if several rows come back, the IssueDate of the last one is kept
                    for results in checkifmag:
                        control_date = results['IssueDate']
                        # frequency = results['Frequency']
                        # regex = results['Regex']

                    nzbtitle_formatted = nzbtitle.replace('.', ' ').replace('-', ' ').replace('/', ' ').replace(
                        '+', ' ').replace('_', ' ').replace('(', '').replace(')', '').strip()
                    # Need to make sure that substrings of magazine titles don't get found
                    # (e.g. Maxim USA will find Maximum PC USA) - token_set_ratio takes care of this
                    # keyword_check = nzbtitle_formatted.replace(bookid, '')
                    # remove extra spaces if they're in a row
                    nzbtitle_exploded_temp = " ".join(nzbtitle_formatted.split())
                    nzbtitle_exploded = nzbtitle_exploded_temp.split(' ')

                    if ' ' in bookid:
                        bookid_exploded = bookid.split(' ')
                    else:
                        bookid_exploded = [bookid]

                    # check nzb starts with magazine title, and ends with a date
                    # eg The MagPI Issue 22 - July 2015
                    # do something like check left n words match title
                    # then check last n words are a date

                    name_match = 1  # assume name matches for now
                    if len(nzbtitle_exploded) > len(bookid_exploded):  # needs to be longer as it has to include a date
                        # check (nearly) all the words in the mag title are in the nzbtitle - allow some fuzz
                        mag_title_match = fuzz.token_set_ratio(
                            common.remove_accents(bookid),
                            common.remove_accents(nzbtitle_formatted))
                        if mag_title_match < lazylibrarian.MATCH_RATIO:
                            logger.debug(
                                u"Magazine token set Match failed: " + str(
                                    mag_title_match) + "% for " + nzbtitle_formatted)
                            name_match = 0

                    # reject a result containing a reject word, unless the
                    # magazine title itself contains that word
                    lower_title = common.remove_accents(nzbtitle_formatted).lower()
                    lower_bookid = common.remove_accents(bookid).lower()
                    for word in reject_list:
                        if word in lower_title and not word in lower_bookid:
                            name_match = 0
                            logger.debug("Rejecting %s, contains %s" % (nzbtitle_formatted, word))
                            break

                    if name_match:
                        # some magazine torrent uploaders add their sig in [] or {}
                        # Fortunately for us, they always seem to add it at the end
                        # also some magazine torrent titles are "magazine_name some_form_of_date pdf"
                        # so strip all the trailing junk...
                        # NOTE(review): indexing [-1][0] raises IndexError if the last word is
                        # an empty string (empty title) or if every word gets popped - confirm
                        # whether such titles can reach this point
                        while nzbtitle_exploded[len(nzbtitle_exploded) - 1][0] in '[{' or \
                                nzbtitle_exploded[len(nzbtitle_exploded) - 1].lower() == 'pdf':
                                nzbtitle_exploded.pop()  # gotta love the function names

                        # need at least one word magazine title and two date components
                        if len(nzbtitle_exploded) > 2:
                            # regexA = DD MonthName YYYY OR MonthName YYYY or Issue nn, MonthName YYYY
                            regexA_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1]
                            regexA_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 2]
                            regexA_month = formatter.month2num(common.remove_accents(regexA_month_temp))
                            if not regexA_year.isdigit() or int(regexA_year) < 1900 or int(regexA_year) > 2100:
                                regexA_year = 'fail'  # force date failure

                            # if frequency == "Weekly" or frequency == "BiWeekly":
                            regexA_day = nzbtitle_exploded[len(nzbtitle_exploded) - 3].rstrip(',').zfill(2)
                            if regexA_day.isdigit():
                                if int(regexA_day) > 31:  # probably issue number nn
                                    regexA_day = '01'
                            else:
                                regexA_day = '01'  # just MonthName YYYY
                            # else:
                            # regexA_day = '01'  # monthly, or less frequent

                            try:
                                newdatish = regexA_year + '-' + regexA_month + '-' + regexA_day
                                # try to make sure the year/month/day are valid, exception if not
                                # ie don't accept day > 31, or 30 in some months
                                # also handles multiple date format named issues eg Jan 2014, 01 2014
                                # datetime will give a ValueError if not a good date or a param is not int
                                # (date1 exists only to trigger that validation; it is not used afterwards)
                                date1 = datetime.date(int(regexA_year), int(regexA_month), int(regexA_day))
                            except ValueError:
                                # regexB = MonthName DD YYYY or MonthName DD, YYYY
                                regexB_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1]
                                regexB_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 3]
                                regexB_month = formatter.month2num(common.remove_accents(regexB_month_temp))
                                regexB_day = nzbtitle_exploded[len(nzbtitle_exploded) - 2].rstrip(',').zfill(2)
                                if not regexB_year.isdigit() or int(regexB_year) < 1900 or int(regexB_year) > 2100:
                                    regexB_year = 'fail'

                                try:
                                    newdatish = regexB_year + '-' + regexB_month + '-' + regexB_day
                                    # datetime will give a ValueError if not a good date or a param is not int
                                    date1 = datetime.date(int(regexB_year), int(regexB_month), int(regexB_day))
                                except ValueError:
                                    # regexC = YYYY MM or YYYY MM DD
                                    # (can't get MM/DD if named YYYY Issue nn)
                                    # First try  YYYY MM
                                    regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 2]
                                    if regexC_year.isdigit() and int(regexC_year) > 1900 and int(regexC_year) < 2100:
                                        regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2)
                                        regexC_day = '01'
                                    else:  # try YYYY MM DD
                                        regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 3]
                                        if regexC_year.isdigit() and int(regexC_year) > 1900 and int(regexC_year) < 2100:
                                            regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 2].zfill(2)
                                            regexC_day = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2)
                                        else:
                                            # NOTE(review): regexC_month/regexC_day are not assigned on
                                            # this path, so they are either unbound or left over from an
                                            # earlier result; either way the try below raises and the
                                            # bare except turns it into a "bad date"
                                            regexC_year = 'fail'
                                    try:
                                        newdatish = regexC_year + '-' + regexC_month + '-' + regexC_day
                                        # datetime will give a ValueError if not a good date or a param is not int
                                        date1 = datetime.date(int(regexC_year), int(regexC_month), int(regexC_day))
                                    except:
                                        logger.debug('Magazine %s not in proper date format.' % nzbtitle_formatted)
                                        bad_date = bad_date + 1
                                        # allow issues with good name but bad date to be included
                                        # so user can manually select them, incl those with issue numbers
                                        newdatish = "1970-01-01"  # provide a fake date for bad-date issues
                                        # continue
                        else:
                            # too few words left to hold a title plus a date: skip this
                            # result without adding it to any of the tallies
                            continue

                        #  store all the _new_ matching results, marking as "skipped" for now
                        #  we change the status to "wanted" on the ones we want to snatch later
                        #  don't add a new entry if this issue has been found on an earlier search
                        #  because status might have been user-set
                        mag_entry = myDB.select('SELECT * from wanted WHERE NZBtitle="%s" and NZBprov="%s"' % (nzbtitle, nzbprov))
                        if not mag_entry:
                            controlValueDict = {
                                "NZBtitle": nzbtitle,
                                "NZBprov": nzbprov
                            }
                            newValueDict = {
                                "NZBurl": nzburl,
                                "BookID": bookid,
                                "NZBdate": nzbdate,
                                "AuxInfo": newdatish,
                                "Status": "Skipped",
                                "NZBsize": nzbsize,
                                "NZBmode": nzbmode
                            }
                            myDB.upsert("wanted", newValueDict, controlValueDict)

                        if control_date is None:  # we haven't got any copies of this magazine yet
                            # get a rough time just over a month ago to compare to, in format yyyy-mm-dd
                            # could perhaps calc differently for weekly, biweekly etc
                            start_time = time.time()
                            start_time -= 31 * 24 * 60 * 60  # number of seconds in 31 days
                            control_date = time.strftime("%Y-%m-%d", time.localtime(start_time))

                        # only grab a copy if it's newer than the most recent we have,
                        # or newer than a month ago if we have none
                        comp_date = formatter.datecompare(newdatish, control_date)
                        if comp_date > 0:
                            # Should probably only upsert when downloaded and processed in case snatch fails
                            # keep track of what we're going to download so we don't download dupes
                            new_date = new_date + 1
                            issue = bookid + ',' + newdatish
                            if issue not in issues:
                                maglist.append({
                                    'bookid': bookid,
                                    'nzbprov': nzbprov,
                                    'nzbtitle': nzbtitle,
                                    'nzburl': nzburl,
                                    'nzbmode': nzbmode
                                })
                                logger.debug('This issue of %s is new, downloading' % nzbtitle_formatted)
                                to_snatch = to_snatch + 1
                                issues.append(issue)

                                # flag the stored entry for this url as wanted
                                controlValueDict = {"NZBurl": nzburl}
                                newValueDict = {
                                    "NZBdate": formatter.now(),  # when we asked for it
                                    "Status": "Wanted"
                                }
                                myDB.upsert("wanted", newValueDict, controlValueDict)

                            else:
                                logger.debug('This issue of %s is already flagged for download' % issue)
                        else:
                            if newdatish != "1970-01-01":  # this is our fake date for ones we can't decipher
                                logger.debug('This issue of %s is old; skipping.' % nzbtitle_formatted)
                                old_date = old_date + 1
                    else:
                        logger.debug('Magazine [%s] does not completely match search term [%s].' % (
                                     nzbtitle_formatted, bookid))
                        bad_regex = bad_regex + 1

            # bookid here is the one taken from the last result processed above
            logger.info('Found %i results for %s. %i new, %i old, %i fail date, %i fail name: %i to download' % (
                        total_nzbs, bookid, new_date, old_date, bad_date, bad_regex, to_snatch))

            # hand each chosen issue to the downloader for its mode;
            # "torznab" and "torrent" both use the torrent download method
            for items in maglist:
                if items['nzbmode'] == "torznab":
                    snatch = TORDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl'])
                elif items['nzbmode'] == "torrent":
                    snatch = TORDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl'])
                else:
                    snatch = NZBDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl'])
                if snatch:
                    # a truthy result sends a notification and starts the 'processDir' job
                    notifiers.notify_snatch(formatter.latinToAscii(items['nzbtitle']) + ' at ' + formatter.now())
                    common.schedule_job(action='Start', target='processDir')
            maglist = []

    if reset:
        common.schedule_job(action='Restart', target='search_magazines')

    logger.info("Search for magazines complete")

예제 #13

파일 보기

파일: postprocess.py 프로젝트: dmfinley/LazyLibrarian

def processDir(force=False, reset=False):
    """Post-process snatched downloads found in the download directory.

    The directory scanned is lazylibrarian.DOWNLOAD_DIR when it is set and
    exists, otherwise the current working directory.  Each row of the
    ``wanted`` table with Status "Snatched" is fuzzy-matched against the
    directory entries; a matching folder (or a single file, which is first
    moved into a folder of its own) is handed to processDestination().  On
    success the ``wanted`` row is marked "Processed" and the book or
    magazine tables are updated; on failure the folder is renamed with a
    ``.fail`` suffix so it is skipped on later runs.  Afterwards any
    remaining entry whose name contains "LL.(" is passed to import_book().

    force -- when False and nothing is marked as snatched, the scheduled
             'processDir' job is stopped and nothing else is done.
    reset -- when true, the scheduled 'processDir' job is restarted at the end.

    Returns False if the download directory cannot be listed, otherwise
    falls off the end (None).
    """
    # rename this thread
    threading.currentThread().name = "POSTPROCESS"

    if not lazylibrarian.DOWNLOAD_DIR or not os.path.isdir(lazylibrarian.DOWNLOAD_DIR):
        processpath = os.getcwd()
    else:
        processpath = lazylibrarian.DOWNLOAD_DIR

    logger.debug(' Checking [%s] for files to post process' % processpath)

    try:
        downloads = os.listdir(processpath)
    except OSError as why:
        logger.error('Could not access [%s] directory [%s]' % (processpath, why.strerror))
        return False

    myDB = database.DBConnection()
    snatched = myDB.select('SELECT * from wanted WHERE Status="Snatched"')

    if force is False and len(snatched) == 0:
        logger.info('Nothing marked as snatched. Stopping postprocessor job.')
        common.schedule_job(action='Stop', target='processDir')
    elif len(downloads) == 0:
        logger.info('No downloads are found. Nothing to process.')
    else:
        logger.debug("Checking %s downloads for %s snatched files" % (len(downloads), len(snatched)))
        ppcount = 0
        for book in snatched:
            # --- step 1: locate the download folder belonging to this snatched item
            found = False
            for fname in downloads:
                if not fname.endswith('.fail'):  # has this failed before?
                    # this is to get round differences in torrent filenames.
                    # Torrents aren't always returned with the name we searched for
                    # there might be a better way...
                    if isinstance(fname, str):
                        matchname = fname.decode(lazylibrarian.SYS_ENCODING)
                    else:
                        matchname = fname
                    # compare only the part of each name before any ' LL.(' marker
                    if ' LL.(' in matchname:
                        matchname = matchname.split(' LL.(')[0]
                    matchtitle = book['NZBtitle']
                    match = 0
                    if matchtitle:
                        if ' LL.(' in matchtitle:
                            matchtitle = matchtitle.split(' LL.(')[0]
                        match = fuzz.token_set_ratio(matchtitle, matchname)
                    if match >= 95:
                        fname = matchname
                        if os.path.isfile(os.path.join(processpath, fname)):
                            # handle single file downloads here...
                            # move the file into a new folder named after it (minus
                            # extension) so the folder check below can pick it up
                            if formatter.is_valid_booktype(fname, booktype="book") \
                                or formatter.is_valid_booktype(fname, booktype="mag"):
                                dirname = os.path.join(processpath, os.path.splitext(fname)[0])
                                if not os.path.exists(dirname):
                                    try:
                                        os.makedirs(dirname)
                                    except OSError as why:
                                        logger.debug('Failed to create directory %s, %s' % (dirname, why.strerror))
                                if os.path.exists(dirname):
                                    try:
                                        shutil.move(os.path.join(processpath, fname), os.path.join(dirname, fname))
                                        fname = os.path.splitext(fname)[0]
                                    except Exception as why:
                                        logger.debug("Failed to move file %s to %s, %s" %
                                            (fname, dirname, str(why)))
                        if os.path.isdir(os.path.join(processpath, fname)):
                            pp_path = os.path.join(processpath, fname)
                            logger.debug('Found folder %s for %s' % (pp_path, book['NZBtitle']))
                            found = True
                            break
                    else:
                        logger.debug('No match (%s%%) %s for %s' % (match, matchname, matchtitle))
                else:
                    logger.debug('Skipping %s' % fname)

            # --- step 2: work out destination folder and file name
            # NOTE(review): the queries below are built with string interpolation;
            # a BookID/Title containing a double quote would break them.
            if found:
                data = myDB.select('SELECT * from books WHERE BookID="%s"' % book['BookID'])
                if data:
                    authorname = data[0]['AuthorName']
                    bookname = data[0]['BookName']
                    if 'windows' in platform.system().lower() and '/' in lazylibrarian.EBOOK_DEST_FOLDER:
                        logger.warn('Please check your EBOOK_DEST_FOLDER setting')
                        lazylibrarian.EBOOK_DEST_FOLDER = lazylibrarian.EBOOK_DEST_FOLDER.replace('/', '\\')

                    # Default destination path, should be allowed change per config file.
                    dest_path = lazylibrarian.EBOOK_DEST_FOLDER.replace('$Author', authorname).replace(
                        '$Title', bookname)
                    global_name = lazylibrarian.EBOOK_DEST_FILE.replace('$Author', authorname).replace(
                        '$Title', bookname)
                    global_name = common.remove_accents(global_name)
                    # dest_path = authorname+'/'+bookname
                    # global_name = bookname + ' - ' + authorname
                    # Remove characters we don't want in the filename BEFORE adding to DESTINATION_DIR
                    # as windows drive identifiers have colon, eg c:  but no colons allowed elsewhere?
                    dic = {'<': '', '>': '', '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's',
                           ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': ''}
                    dest_path = formatter.latinToAscii(formatter.replace_all(dest_path, dic))
                    dest_path = os.path.join(lazylibrarian.DESTINATION_DIR, dest_path).encode(
                        lazylibrarian.SYS_ENCODING)
                else:
                    # not in the books table, so look it up as a magazine title
                    data = myDB.select('SELECT * from magazines WHERE Title="%s"' % book['BookID'])
                    if data:
                        # AuxInfo was added for magazine release date, normally housed in 'magazines' but if multiple
                        # files are downloading, there will be an error in post-processing, trying to go to the
                        # same directory.
                        mostrecentissue = data[0]['IssueDate']  # keep for processing issues arriving out of order
                        # Remove characters we don't want in the filename before (maybe) adding to DESTINATION_DIR
                        # as windows drive identifiers have colon, eg c:  but no colons allowed elsewhere?
                        dic = {'<': '', '>': '', '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's',
                               ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': ''}
                        mag_name = formatter.latinToAscii(formatter.replace_all(book['BookID'], dic))
                        # book auxinfo is a cleaned date, eg 2015-01-01
                        dest_path = lazylibrarian.MAG_DEST_FOLDER.replace(
                            '$IssueDate',
                            book['AuxInfo']).replace('$Title', mag_name)
                        # dest_path = '_Magazines/'+title+'/'+book['AuxInfo']
                        if lazylibrarian.MAG_RELATIVE:
                            if dest_path[0] not in '._':
                                dest_path = '_' + dest_path
                            dest_path = os.path.join(lazylibrarian.DESTINATION_DIR, dest_path).encode(
                                lazylibrarian.SYS_ENCODING)
                        else:
                            dest_path = dest_path.encode(lazylibrarian.SYS_ENCODING)
                        # authorname/bookname of None mark this item as a magazine below
                        authorname = None
                        bookname = None
                        global_name = lazylibrarian.MAG_DEST_FILE.replace('$IssueDate', book['AuxInfo']).replace(
                            '$Title', mag_name)
                        global_name = common.remove_accents(global_name)
                        # global_name = book['AuxInfo']+' - '+title
                    else:
                        logger.debug("Snatched magazine %s is not in download directory" % (book['BookID']))
                        continue
            else:
                logger.debug("Snatched %s %s is not in download directory" % (book['NZBmode'], book['NZBtitle']))
                continue

            # try:
            #    os.chmod(dest_path, 0777)
            # except Exception, e:
            #    logger.debug("Could not chmod post-process directory: " + str(dest_path))

            # --- step 3: move the files and record the result
            processBook = processDestination(pp_path, dest_path, authorname, bookname, global_name)

            if processBook:
                logger.debug("Processing %s, %s" % (global_name, book['NZBurl']))
                # update nzbs, only update the snatched ones in case multiple matches for same book / magazine issue
                controlValueDict = {"NZBurl": book['NZBurl'], "Status": "Snatched"}
                newValueDict = {"Status": "Processed", "NZBDate": formatter.now()}  # say when we processed it
                myDB.upsert("wanted", newValueDict, controlValueDict)

                if bookname is not None:  # it's a book, if None it's a magazine
                    processExtras(myDB, dest_path, global_name, data)
                else:
                    # update mags
                    controlValueDict = {"Title": book['BookID']}
                    if mostrecentissue:
                        if mostrecentissue.isdigit() and str(book['AuxInfo']).isdigit():
                            older = int(mostrecentissue) > int(book['AuxInfo']) # issuenumber
                        else:
                            older = mostrecentissue > book['AuxInfo']  # YYYY-MM-DD
                    else:
                        older = False
                    if older:  # check this in case processing issues arriving out of order
                        newValueDict = {"LastAcquired": formatter.today(), "IssueStatus": "Open"}
                    else:
                        newValueDict = {"IssueDate": book['AuxInfo'], "LastAcquired": formatter.today(),
                                        "IssueStatus": "Open"}
                    myDB.upsert("magazines", newValueDict, controlValueDict)
                    # dest_path is where we put the magazine after processing, but we don't have the full filename
                    # so look for any "book" in that directory
                    dest_file = book_file(dest_path, booktype='mag')
                    controlValueDict = {"Title": book['BookID'], "IssueDate": book['AuxInfo']}
                    newValueDict = {"IssueAcquired": formatter.today(),
                                    "IssueFile": dest_file,
                                    "IssueID": magazinescan.create_id("%s %s" % (book['BookID'], book['AuxInfo']))
                                    }
                    myDB.upsert("issues", newValueDict, controlValueDict)

                    # create a thumbnail cover for the new issue
                    magazinescan.create_cover(dest_file)

                logger.info('Successfully processed: %s' % global_name)
                ppcount = ppcount + 1
                notifiers.notify_download(formatter.latinToAscii(global_name) + ' at ' + formatter.now())
            else:
                logger.error('Postprocessing for %s has failed.' % global_name)
                logger.error('Warning - Residual files remain in %s.fail' % pp_path)
                # at this point, as it failed we should move it or it will get postprocessed
                # again (and fail again)
                try:
                    os.rename(pp_path, pp_path + '.fail')
                except Exception as why:
                    # best effort only, but don't swallow KeyboardInterrupt/SystemExit
                    # and do say why the rename failed
                    logger.debug("Unable to rename %s, %s" % (pp_path, str(why)))

        # --- second pass: anything carrying an "LL.(<id>)" marker is imported by id
        downloads = os.listdir(processpath)  # check in case we processed/deleted some above
        for directory in downloads:
            if "LL.(" in directory and not directory.endswith('.fail'):
                bookID = str(directory).split("LL.(")[1].split(")")[0]
                logger.debug("Book with id: " + str(bookID) + " is in downloads")
                pp_path = os.path.join(processpath, directory)

                # a marked single file is imported from the download directory itself
                if os.path.isfile(pp_path):
                    pp_path = os.path.join(processpath)

                if (os.path.isdir(pp_path)):
                    logger.debug('Found LL folder %s.' % pp_path)
                # NOTE(review): import_book runs whether or not pp_path is a
                # directory - confirm that is intended
                if import_book(pp_path, bookID):
                    ppcount = ppcount + 1

        if ppcount == 0:
            logger.info('No snatched books/mags have been found')
        elif ppcount == 1:
            logger.info('1 book/mag has been processed.')
        else:
            logger.info('%s books/mags have been processed.' % ppcount)
            
    if reset:
        common.schedule_job(action='Restart', target='processDir')

예제 #14

파일 보기

파일: librarysync.py 프로젝트: duckville/LazyLibrarian

def LibraryScan(dir=None):
    """Scan a directory tree for ebook files and match them to the database.

    dir -- directory to scan; when empty, lazylibrarian.DOWNLOAD_DIR is used.
           Returns early (None) if neither is set or the directory is missing.

    The ``stats`` table is dropped and recreated.  With
    lazylibrarian.FULL_SCAN set, books with Status "Open" whose BookFile no
    longer exists are set to lazylibrarian.NOTFOUND_STATUS first.  The tree
    is then walked (skipping directories starting with "." or "_"); for each
    file of a valid book type, author/title are taken from embedded
    epub/mobi metadata, overridden by any .opf file in the same directory,
    or failing both from the filename via the EBOOK_DEST_FILE template.
    Unknown authors may be added (if lazylibrarian.ADD_AUTHOR), and a book
    found in the database that is not already "Open" is set to "Open" with
    its BookFile recorded.  Finally statistics are logged and the
    HaveBooks/UnignoredBooks counts are refreshed for every author.

    NOTE(review): every SQL statement here is built by string interpolation
    inside double quotes; a value containing a double quote will break the
    statement.  Left as-is because the DB wrapper's parameter API is not
    visible from this function.
    """
    if not dir:
        if not lazylibrarian.DOWNLOAD_DIR:
            return
        else:
            dir = lazylibrarian.DOWNLOAD_DIR

    if not os.path.isdir(dir):
        logger.warn("Cannot find directory: %s. Not scanning" % dir.decode(lazylibrarian.SYS_ENCODING, "replace"))
        return

    myDB = database.DBConnection()

    myDB.action("drop table if exists stats")
    myDB.action(
        "create table stats (authorname text, GR_book_hits int, GR_lang_hits int, LT_lang_hits int, \
                            GB_lang_change, cache_hits int, bad_lang int, bad_char int, uncached int )"
    )

    logger.info("Scanning ebook directory: %s" % dir.decode(lazylibrarian.SYS_ENCODING, "replace"))

    new_book_count = 0
    file_count = 0

    if lazylibrarian.FULL_SCAN:
        books = myDB.select('select AuthorName, BookName, BookFile, BookID from books where Status="Open"')
        status = lazylibrarian.NOTFOUND_STATUS
        logger.info("Missing books will be marked as %s" % status)
        for book in books:
            bookName = book["BookName"]
            bookAuthor = book["AuthorName"]
            bookID = book["BookID"]
            bookfile = book["BookFile"]

            if not (bookfile and os.path.isfile(bookfile)):
                myDB.action('update books set Status="%s" where BookID="%s"' % (status, bookID))
                myDB.action('update books set BookFile="" where BookID="%s"' % bookID)
                logger.warn("Book %s - %s updated as not found on disk" % (bookAuthor, bookName))

    # guess this was meant to save repeat-scans of the same directory
    # if it contains multiple formats of the same book, but there was no code
    # that looked at the array. renamed from latest to processed to make
    # purpose clearer
    processed_subdirectories = []

    # massage the EBOOK_DEST_FILE config parameter into something we can use
    # with regular expression matching.
    # re.escape keeps every literal character of the template literal (the
    # pattern is compiled with re.VERBOSE, so spaces and '#' must stay escaped;
    # blindly backslash-prefixing letters would turn e.g. "b" into the \b
    # word-boundary assertion).  The $Author/$Title placeholders then become
    # named groups, and the extension is a real alternation rather than a
    # single-character class.
    booktype_list = formatter.getList(lazylibrarian.EBOOK_TYPE)
    booktypes = "|".join(re.escape(book_type) for book_type in booktype_list)
    matchString = (
        re.escape(lazylibrarian.EBOOK_DEST_FILE)
        .replace("\\$Author", "(?P<author>.*?)")
        .replace("\\$Title", "(?P<book>.*?)")
        + "\\.(?:"
        + booktypes
        + ")"
    )
    pattern = re.compile(matchString, re.VERBOSE)

    for r, d, f in os.walk(dir):
        # prune in place so os.walk does not descend into hidden ("." prefix)
        # directories; "_" prefix prevents magazines being scanned
        for directory in d[:]:
            if directory.startswith((".", "_")):
                d.remove(directory)
        for files in f:
            file_count += 1
            subdirectory = r.replace(dir, "")
            # Added new code to skip if we've done this directory before.
            # Made this conditional with a switch in config.ini
            # in case user keeps multiple different books in the same subdirectory
            if (lazylibrarian.IMP_SINGLEBOOK) and (subdirectory in processed_subdirectories):
                logger.debug("[%s] already scanned" % subdirectory)
            else:
                # If this is a book, try to get author/title/isbn/language
                # if epub or mobi, read metadata from the book
                # If metadata.opf exists, use that allowing it to override
                # embedded metadata. User may have edited metadata.opf
                # to merge author aliases together
                # If all else fails, try pattern match for author/title
                # and look up isbn/lang from LT or GR later
                match = 0

                if formatter.is_valid_booktype(files):
                    logger.debug(
                        "[%s] Now scanning subdirectory %s"
                        % (
                            dir.decode(lazylibrarian.SYS_ENCODING, "replace"),
                            subdirectory.decode(lazylibrarian.SYS_ENCODING, "replace"),
                        )
                    )
                    language = "Unknown"
                    isbn = ""
                    book = ""
                    author = ""
                    words = files.split(".")
                    extn = words[-1]

                    # if it's an epub or a mobi we can try to read metadata from it
                    if (extn == "epub") or (extn == "mobi"):
                        book_filename = os.path.join(r, files).encode(lazylibrarian.SYS_ENCODING)
                        try:
                            res = get_book_info(book_filename)
                        except Exception:
                            # unreadable metadata is treated the same as no metadata
                            res = {}
                        if "title" in res and "creator" in res:  # this is the minimum we need
                            match = 1
                            book = res["title"]
                            author = res["creator"]
                            if "language" in res:
                                language = res["language"]
                            if "identifier" in res:
                                isbn = res["identifier"]
                            if "type" in res:
                                extn = res["type"]
                            logger.debug("book meta [%s] [%s] [%s] [%s] [%s]" % (isbn, language, author, book, extn))
                        else:
                            logger.debug("Book meta incomplete in %s" % book_filename)

                    # calibre uses "metadata.opf", LL uses "bookname - authorname.opf"
                    # just look for any .opf file in the current directory since we don't know
                    # LL preferred authorname/bookname at this point.
                    # Allow metadata in file to override book contents as may be users pref
                    metafile = opf_file(r)
                    try:
                        res = get_book_info(metafile)
                    except Exception:
                        res = {}
                    if "title" in res and "creator" in res:  # this is the minimum we need
                        match = 1
                        book = res["title"]
                        author = res["creator"]
                        if "language" in res:
                            language = res["language"]
                        if "identifier" in res:
                            isbn = res["identifier"]
                        logger.debug("file meta [%s] [%s] [%s] [%s]" % (isbn, language, author, book))
                    else:
                        logger.debug("File meta incomplete in %s" % metafile)

                    if not match:  # no author/book from metadata file, and not embedded either
                        match = pattern.match(files)
                        if match:
                            author = match.group("author")
                            book = match.group("book")
                        else:
                            logger.debug("Pattern match failed [%s]" % files)

                    if match:
                        # flag that we found a book in this subdirectory
                        processed_subdirectories.append(subdirectory)

                        # If we have a valid looking isbn, and language != "Unknown", add it to cache
                        if language != "Unknown" and formatter.is_valid_isbn(isbn):
                            logger.debug("Found Language [%s] ISBN [%s]" % (language, isbn))
                            # we need to add it to language cache if not already
                            # there, is_valid_isbn has checked length is 10 or 13
                            if len(isbn) == 10:
                                isbnhead = isbn[0:3]
                            else:
                                isbnhead = isbn[3:6]
                            match = myDB.action('SELECT lang FROM languages where isbn = "%s"' % (isbnhead)).fetchone()
                            if not match:
                                myDB.action('insert into languages values ("%s", "%s")' % (isbnhead, language))
                                logger.debug("Cached Lang [%s] ISBN [%s]" % (language, isbnhead))
                            else:
                                logger.debug("Already cached Lang [%s] ISBN [%s]" % (language, isbnhead))

                        # get authors name in a consistent format
                        if "," in author:  # "surname, forename"
                            words = author.split(",")
                            author = words[1].strip() + " " + words[0].strip()  # "forename surname"
                        # "J R R Tolkien" style initials become "J.R.R.Tolkien".
                        # Length guard: the lazy filename groups can yield an empty
                        # or one-character author, which must not raise IndexError.
                        if len(author) > 1 and author[1] == " ":
                            author = author.replace(" ", ".")
                            author = author.replace("..", ".")

                        # Check if the author exists, and import the author if not,
                        # before starting any complicated book-name matching to save repeating the search
                        #
                        check_exist_author = myDB.action(
                            'SELECT * FROM authors where AuthorName="%s"' % author
                        ).fetchone()
                        if not check_exist_author and lazylibrarian.ADD_AUTHOR:
                            # no match for supplied author, but we're allowed to
                            # add new ones

                            GR = GoodReads(author)
                            try:
                                author_gr = GR.find_author_id()
                            except Exception:
                                logger.warn("Error finding author id for [%s]" % author)
                                continue

                            # only try to add if GR data matches found author data
                            if author_gr:
                                authorname = author_gr["authorname"]

                                # "J.R.R. Tolkien" is the same person as "J. R. R. Tolkien" and "J R R Tolkien"
                                match_auth = author.replace(".", "_")
                                match_auth = match_auth.replace(" ", "_")
                                match_auth = match_auth.replace("__", "_")
                                match_name = authorname.replace(".", "_")
                                match_name = match_name.replace(" ", "_")
                                match_name = match_name.replace("__", "_")
                                match_name = common.remove_accents(match_name)
                                match_auth = common.remove_accents(match_auth)
                                # allow a degree of fuzziness to cater for different accented character handling.
                                # some author names have accents,
                                # filename may have the accented or un-accented version of the character
                                # The currently non-configurable value of fuzziness might need to go in config
                                # We stored GoodReads unmodified author name in
                                # author_gr, so store in LL db under that
                                # fuzz.ratio doesn't lowercase for us
                                match_fuzz = fuzz.ratio(match_auth.lower(), match_name.lower())
                                if match_fuzz < 90:
                                    logger.debug("Failed to match author [%s] fuzz [%d]" % (author, match_fuzz))
                                    logger.debug(
                                        "Failed to match author [%s] to authorname [%s]" % (match_auth, match_name)
                                    )

                                # To save loading hundreds of books by unknown
                                # authors at GR or GB, ignore if author "Unknown"
                                if (author != "Unknown") and (match_fuzz >= 90):
                                    # use "intact" name for author that we stored in
                                    # GR author_dict, not one of the various mangled versions
                                    # otherwise the books appear to be by a different author!
                                    author = author_gr["authorname"]
                                    # this new authorname may already be in the
                                    # database, so check again
                                    check_exist_author = myDB.action(
                                        'SELECT * FROM authors where AuthorName="%s"' % author
                                    ).fetchone()
                                    if not check_exist_author:
                                        logger.debug("Adding new author [%s]" % author)
                                        try:
                                            importer.addAuthorToDB(author)
                                            check_exist_author = myDB.action(
                                                'SELECT * FROM authors where AuthorName="%s"' % author
                                            ).fetchone()
                                        except Exception:
                                            # skip this file, but leave a trace of why
                                            logger.warn("Error adding author [%s]" % author)
                                            continue

                        # check author exists in db, either newly loaded or already there
                        if not check_exist_author:
                            logger.debug("Failed to match author [%s] in database" % author)
                        else:
                            # author exists, check if this book by this author is in our database
                            # metadata might have quotes in book name
                            book = book.replace('"', "").replace("'", "")
                            bookid = find_book_in_db(myDB, author, book)
                            if bookid:
                                # check if book is already marked as "Open" (if so,
                                # we already had it)
                                check_status = myDB.action(
                                    'SELECT Status from books where BookID="%s"' % bookid
                                ).fetchone()
                                if check_status["Status"] != "Open":
                                    # update status as we've got this book
                                    myDB.action('UPDATE books set Status="Open" where BookID="%s"' % bookid)
                                    book_filename = os.path.join(r, files).encode(lazylibrarian.SYS_ENCODING)
                                    # update book location so we can check if it
                                    # gets removed, or allow click-to-open
                                    myDB.action(
                                        'UPDATE books set BookFile="%s" where BookID="%s"' % (book_filename, bookid)
                                    )
                                    new_book_count += 1

    cachesize = myDB.action("select count('ISBN') as counter from languages").fetchone()
    logger.info("%s new/modified books found and added to the database" % new_book_count)
    logger.info("%s files processed" % file_count)
    stats = myDB.action(
        "SELECT sum(GR_book_hits), sum(GR_lang_hits), sum(LT_lang_hits), sum(GB_lang_change), \
            sum(cache_hits), sum(bad_lang), sum(bad_char), sum(uncached) FROM stats"
    ).fetchone()
    if stats["sum(GR_book_hits)"] is not None:
        # only show stats if new books added
        if lazylibrarian.BOOK_API == "GoogleBooks":
            logger.debug("GoogleBooks was hit %s times for books" % stats["sum(GR_book_hits)"])
            logger.debug("GoogleBooks language was changed %s times" % stats["sum(GB_lang_change)"])
        if lazylibrarian.BOOK_API == "GoodReads":
            logger.debug("GoodReads was hit %s times for books" % stats["sum(GR_book_hits)"])
            logger.debug("GoodReads was hit %s times for languages" % stats["sum(GR_lang_hits)"])
        logger.debug("LibraryThing was hit %s times for languages" % stats["sum(LT_lang_hits)"])
        logger.debug("Language cache was hit %s times" % stats["sum(cache_hits)"])
        logger.debug("Unwanted language removed %s books" % stats["sum(bad_lang)"])
        logger.debug("Unwanted characters removed %s books" % stats["sum(bad_char)"])
        logger.debug("Unable to cache %s books with missing ISBN" % stats["sum(uncached)"])
    logger.debug("Cache %s hits, %s miss" % (lazylibrarian.CACHE_HIT, lazylibrarian.CACHE_MISS))
    logger.debug("ISBN Language cache holds %s entries" % cachesize["counter"])
    stats = len(myDB.select('select BookID from Books where status="Open" and BookLang="Unknown"'))
    if stats:
        logger.warn("There are %s books in your library with unknown language" % stats)

    authors = myDB.select("select AuthorName from authors")
    # Update bookcounts for all authors, not just new ones - refresh may have located
    # new books for existing authors especially if switched provider gb/gr
    logger.debug("Updating bookcounts for %i authors" % len(authors))
    for author in authors:
        name = author["AuthorName"]
        havebooks = myDB.action(
            'SELECT count("BookID") as counter from books WHERE AuthorName="%s" AND (Status="Have" OR Status="Open")'
            % name
        ).fetchone()
        myDB.action('UPDATE authors set HaveBooks="%s" where AuthorName="%s"' % (havebooks["counter"], name))
        totalbooks = myDB.action(
            'SELECT count("BookID") as counter FROM books WHERE AuthorName="%s" AND Status!="Ignored"' % name
        ).fetchone()
        myDB.action('UPDATE authors set UnignoredBooks="%s" where AuthorName="%s"' % (totalbooks["counter"], name))

    logger.info("Library scan complete")

예제 #15

파일 보기

파일: searchmag.py 프로젝트: duckville/LazyLibrarian

def search_magazines(mags=None):
    """Search the enabled providers for new issues of active magazines.

    For each magazine a cleaned search term is sent to the nzb and/or torrent
    providers. Every result title is fuzzy-matched word by word against the
    magazine title, an issue date is parsed from the trailing words of the
    result title, and the result is upserted into the "wanted" table. Results
    whose issue date is newer than the magazine's IssueDate (or newer than
    31 days ago when no IssueDate is stored) are handed to the download method.

    Args:
        mags: optional iterable of dicts whose 'bookid' entry is a magazine
            title. When None (backlog search) every magazine with
            Status="Active" is searched.
    """
    # produce a list of magazines to search for, tor, nzb, torznab

    myDB = database.DBConnection()
    searchlist = []
    threading.currentThread().name = "SEARCHMAGS"

    # NOTE(review): the queries in this function are assembled with % string
    # formatting from magazine and release titles. Consider bound parameters
    # if the database wrapper supports them - TODO confirm its API.
    if mags is None:  # backlog search
        searchmags = myDB.select('SELECT Title, Frequency, LastAcquired, \
                                 IssueDate from magazines WHERE Status="Active"')
    else:
        searchmags = []
        for magazine in mags:
            searchmags_temp = myDB.select('SELECT Title, Frequency, LastAcquired, IssueDate from magazines \
                                          WHERE Title="%s" AND Status="Active"' % (magazine['bookid']))
            searchmags.extend(searchmags_temp)

    if len(searchmags) == 1:
        logger.info('Searching for one magazine')
    else:
        logger.info('Searching for %i magazines' % len(searchmags))

    for searchmag in searchmags:
        # the magazine title doubles as its id and as the base of the search term
        bookid = searchmag[0]
        searchterm = searchmag[0]
        frequency = searchmag[1]

        # characters removed or substituted before the title is sent to a provider
        dic = {'...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's', ' + ': ' ', '"': '', ',': '', '*': ''}

        searchterm = formatter.latinToAscii(formatter.replace_all(searchterm, dic))
        # raw string: dots, dashes and slashes become spaces
        searchterm = re.sub(r'[.\-/]', ' ', searchterm).encode('utf-8')
        searchlist.append({"bookid": bookid, "searchterm": searchterm})

    if not searchlist:
        logger.warn('There is nothing to search for.  Mark some magazines as active.')

    for book in searchlist:

        resultlist = []
        tor_resultlist = []
        if lazylibrarian.USE_NZB:
            resultlist, nproviders = providers.IterateOverNewzNabSites(book, 'mag')
            if not nproviders:
                logger.warn('No nzb providers are set. Check config for NEWZNAB or TORZNAB providers')

        if lazylibrarian.USE_TOR:
            tor_resultlist, nproviders = providers.IterateOverTorrentSites(book, 'mag')
            if not nproviders:
                logger.warn('No torrent providers are set. Check config for TORRENT providers')

            for item in tor_resultlist:  # reformat the torrent results so they look like nzbs
                resultlist.append({
                    'bookid': item['bookid'],
                    'nzbprov': item['tor_prov'],
                    'nzbtitle': item['tor_title'],
                    'nzburl': item['tor_url'],
                    'nzbdate': 'Fri, 01 Jan 1970 00:00:00 +0100',  # fake date as none returned from torrents
                    'nzbsize': item['tor_size'],
                    'nzbmode': 'torrent'
                })

        if not resultlist:
            logger.debug("Adding magazine %s to queue." % book['searchterm'])

        else:
            # per-magazine counters reported in the summary log line below
            bad_regex = 0
            bad_date = 0
            old_date = 0
            total_nzbs = 0
            new_date = 0
            to_snatch = 0
            maglist = []  # results selected for download
            issues = []  # "title,date" keys already selected, to avoid duplicate downloads
            for nzb in resultlist:
                total_nzbs = total_nzbs + 1
                bookid = nzb['bookid']
                nzbtitle = (u'%s' % nzb['nzbtitle'])
                nzbtitle = nzbtitle.replace('"', '').replace("'", "")  # suppress " in titles
                nzburl = nzb['nzburl']
                nzbprov = nzb['nzbprov']
                nzbdate_temp = nzb['nzbdate']
                nzbsize_temp = nzb['nzbsize']
                if nzbsize_temp is None:  # not all torrents returned by torznab have a size
                    nzbsize_temp = 1000
                nzbsize = str(round(float(nzbsize_temp) / 1048576, 2)) + ' MB'
                nzbdate = formatter.nzbdate2format(nzbdate_temp)
                nzbmode = nzb['nzbmode']
                checkifmag = myDB.select('SELECT * from magazines WHERE Title="%s"' % bookid)
                if checkifmag:
                    for results in checkifmag:
                        control_date = results['IssueDate']
                        frequency = results['Frequency']

                    nzbtitle_formatted = nzbtitle.replace('.', ' ').replace('-', ' ').replace('/', ' ').replace(
                        '+', ' ').replace('_', ' ').replace('(', '').replace(')', '').strip()
                    # Need to make sure that substrings of magazine titles don't get found
                    # (e.g. Maxim USA will find Maximum PC USA)
                    # collapse runs of spaces, then split into words
                    nzbtitle_exploded = " ".join(nzbtitle_formatted.split()).split(' ')

                    # split(' ') already yields [bookid] when the title has no space
                    bookid_exploded = bookid.split(' ')

                    # check nzb starts with magazine title, and ends with a date
                    # eg The MagPI Issue 22 - July 2015
                    # do something like check left n words match title
                    # then check last n words are a date

                    name_match = 1  # assume name matches for now
                    name_len = len(bookid_exploded)
                    if len(nzbtitle_exploded) > name_len:  # needs to be longer as it should include a date
                        while name_len:
                            name_len = name_len - 1
                            # fuzzy check on each word in the magazine name with any accents stripped
                            # fuzz.ratio doesn't lowercase for us
                            ratio = fuzz.ratio(common.remove_accents(nzbtitle_exploded[name_len].lower()),
                                               common.remove_accents(bookid_exploded[name_len].lower()))
                            if ratio < 80:  # hard coded fuzz ratio for now, works for close matches
                                logger.debug("Magazine fuzz ratio failed [%d] [%s] [%s]" % (
                                             ratio, bookid, nzbtitle_formatted))
                                name_match = 0  # name match failed
                                break  # one mismatching word is enough, skip the remaining comparisons
                    if name_match:
                        # some magazine torrent uploaders add their sig in [] or {}
                        # Fortunately for us, they always seem to add it at the end
                        # some magazine torrent titles are "magazine_name some_form_of_date pdf"
                        # so strip all the trailing junk...
                        # Guard against running off the front of the list: a title made only of
                        # junk words (or an empty title) used to raise IndexError here.
                        while nzbtitle_exploded and (
                                nzbtitle_exploded[-1].startswith(('[', '{')) or
                                nzbtitle_exploded[-1].lower() == 'pdf'):
                            nzbtitle_exploded.pop()

                        if len(nzbtitle_exploded) > 1:
                            # NOTE: the "len(...) - 3" indexes below deliberately keep the original
                            # arithmetic; with only two words they wrap round to the last word
                            # instead of raising IndexError.
                            # regexA = DD MonthName YYYY OR MonthName YYYY or nn MonthName YYYY
                            regexA_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1]
                            if regexA_year.isdigit():
                                if int(regexA_year) < 1900 or int(regexA_year) > 2100:
                                    regexA_year = 'Invalid'
                            regexA_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 2]
                            regexA_month = formatter.month2num(common.remove_accents(regexA_month_temp))

                            if frequency == "Weekly" or frequency == "BiWeekly":
                                regexA_day = nzbtitle_exploded[len(nzbtitle_exploded) - 3].zfill(2)
                                if regexA_day.isdigit():
                                    if int(regexA_day) > 31:  # probably issue number nn
                                        regexA_day = '01'
                                else:
                                    regexA_day = '01'  # just MonthName YYYY
                            else:
                                regexA_day = '01'  # monthly, or less frequent

                            newdatish_regexA = regexA_year + regexA_month + regexA_day

                            # each candidate is accepted only if its concatenated parts form an integer
                            try:
                                int(newdatish_regexA)
                                newdatish = regexA_year + '-' + regexA_month + '-' + regexA_day
                            except ValueError:
                                # regexB = MonthName DD YYYY
                                regexB_year = nzbtitle_exploded[len(nzbtitle_exploded) - 1]
                                regexB_day = nzbtitle_exploded[len(nzbtitle_exploded) - 2].zfill(2)
                                regexB_month_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 3]
                                regexB_month = formatter.month2num(common.remove_accents(regexB_month_temp))
                                newdatish_regexB = regexB_year + regexB_month + regexB_day

                                try:
                                    int(newdatish_regexB)
                                    newdatish = regexB_year + '-' + regexB_month + '-' + regexB_day
                                except ValueError:
                                    # regexC = YYYY MM or YYYY MM DD or Issue nn YYYY
                                    # (can't get MM/DD if named Issue nn)
                                    newdatish_regexC = 'Invalid'  # invalid unless works out otherwise
                                    regexC_temp = nzbtitle_exploded[len(nzbtitle_exploded) - 2]
                                    if regexC_temp.isdigit():
                                        if int(regexC_temp) > 1900 and int(regexC_temp) < 2100:
                                            # YYYY MM  or YYYY nn
                                            regexC_year = regexC_temp
                                            regexC_month = nzbtitle_exploded[len(nzbtitle_exploded) - 1].zfill(2)
                                            regexC_day = '01'
                                            if regexC_month.isdigit():  # could be YYYY nn where nn is issue number
                                                if int(regexC_month) < 13:
                                                    # if issue number > 12 date matching will fail
                                                    newdatish_regexC = regexC_year + regexC_month + regexC_day
                                        else:
                                            regexC_year = nzbtitle_exploded[len(nzbtitle_exploded) - 3]
                                            if regexC_year.isdigit():
                                                if int(regexC_year) > 1900 and int(regexC_year) < 2100:
                                                    # YYYY MM DD or YYYY nn-nn
                                                    regexC_month = regexC_temp.zfill(2)
                                                    if int(regexC_month) < 13:
                                                        # if issue number > 12 date matching will fail
                                                        regexC_day = nzbtitle_exploded[len(
                                                            nzbtitle_exploded) - 1].zfill(2)
                                                        newdatish_regexC = regexC_year + regexC_month + regexC_day

                                    try:
                                        int(newdatish_regexC)
                                        newdatish = regexC_year + '-' + regexC_month + '-' + regexC_day
                                    except ValueError:
                                        logger.debug('Magazine %s not in proper date format.' % nzbtitle_formatted)
                                        bad_date = bad_date + 1
                                        # allow issues with good name but bad date to be included
                                        # so user can manually select them
                                        newdatish = "1970-01-01"  # provide a fake date for bad-date issues

                        else:
                            # nothing left that could hold both a name and a date
                            continue

                        # Don't want to overwrite status = Skipped for NZBs that have been previously found
                        wanted_status = myDB.select('SELECT * from wanted WHERE NZBtitle="%s"' % nzbtitle)
                        if wanted_status:
                            for results in wanted_status:
                                status = results['Status']
                        else:
                            status = "Skipped"

                        controlValueDict = {"NZBurl": nzburl}
                        newValueDict = {
                            "NZBprov": nzbprov,
                            "BookID": bookid,
                            "NZBdate": nzbdate,
                            "NZBtitle": nzbtitle,
                            "AuxInfo": newdatish,
                            "Status": status,
                            "NZBsize": nzbsize,
                            "NZBmode": nzbmode
                        }
                        myDB.upsert("wanted", newValueDict, controlValueDict)

                        if control_date is None:  # we haven't got any copies of this magazine yet
                            # get a rough time just over a month ago to compare to, in format yyyy-mm-dd
                            # could perhaps calc differently for weekly, biweekly etc
                            start_time = time.time()
                            start_time -= 31 * 24 * 60 * 60  # number of seconds in 31 days
                            control_date = time.strftime("%Y-%m-%d", time.localtime(start_time))

                        # only grab a copy if it's newer than the most recent we have,
                        # or newer than a month ago if we have none
                        comp_date = formatter.datecompare(newdatish, control_date)
                        if comp_date > 0:
                            # Should probably only upsert when downloaded and processed in case snatch fails
                            # keep track of what we're going to download so we don't download dupes
                            new_date = new_date + 1
                            issue = bookid + ',' + newdatish
                            if issue not in issues:
                                maglist.append({
                                    'bookid': bookid,
                                    'nzbprov': nzbprov,
                                    'nzbtitle': nzbtitle,
                                    'nzburl': nzburl,
                                    'nzbmode': nzbmode
                                })
                                logger.debug('This issue of %s is new, downloading' % nzbtitle_formatted)
                                to_snatch = to_snatch + 1
                                issues.append(issue)
                            else:
                                logger.debug('This issue of %s is already flagged for download' % issue)
                        else:
                            if newdatish != "1970-01-01":  # this is our fake date for ones we can't decipher
                                logger.debug('This issue of %s is old; skipping.' % nzbtitle_formatted)
                                old_date = old_date + 1
                    else:
                        logger.debug('Magazine [%s] does not completely match search term [%s].' % (
                                     nzbtitle_formatted, bookid))
                        bad_regex = bad_regex + 1

            logger.info('Found %s results for %s.  %s are new, %s are old, %s fail date, %s fail name matching' % (
                        total_nzbs, bookid, new_date, old_date, bad_date, bad_regex))
            logger.info("%s, %s issues to download" % (bookid, to_snatch))

            for items in maglist:
                # torznab and plain torrent results share the torrent download method
                if items['nzbmode'] in ("torznab", "torrent"):
                    snatch = TORDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl'])
                else:
                    snatch = NZBDownloadMethod(items['bookid'], items['nzbprov'], items['nzbtitle'], items['nzburl'])
                if snatch:
                    notifiers.notify_snatch(formatter.latinToAscii(items['nzbtitle']) + ' at ' + formatter.now())
                    postprocess.schedule_processor(action='Start')
    logger.info("Search for magazines complete")

예제 #16

파일 보기

def processDir(force=False, reset=False):
    """Post-process snatched downloads found in the download directory.

    Entries of the download directory are fuzzy-matched against the NZBtitle
    of every "wanted" row with Status="Snatched". For a match, the destination
    is built from the matching row of the books table (or, failing that, the
    magazines table) and the folder is handed to processDestination. On
    success the wanted/magazines/issues tables are updated and a download
    notification is sent; on failure the folder is renamed with a ".fail"
    suffix. Afterwards, remaining directory entries whose name contains "LL.("
    are passed to import_book.

    Args:
        force: when False and nothing is marked as snatched, the scheduled
            processDir job is stopped instead of scanning.
        reset: when true, the processDir job is restarted before returning.

    Returns:
        False if the download directory cannot be listed, otherwise None.
    """
    # rename this thread
    threading.currentThread().name = "POSTPROCESS"

    # fall back to the current working directory when no usable download dir is configured
    if not lazylibrarian.DOWNLOAD_DIR or not os.path.isdir(lazylibrarian.DOWNLOAD_DIR):
        processpath = os.getcwd()
    else:
        processpath = lazylibrarian.DOWNLOAD_DIR

    logger.debug(' Checking [%s] for files to post process' % processpath)

    try:
        downloads = os.listdir(processpath)
    except OSError as why:
        logger.error('Could not access [%s] directory [%s]' % (processpath, why.strerror))
        return False

    myDB = database.DBConnection()
    snatched = myDB.select('SELECT * from wanted WHERE Status="Snatched"')

    if force is False and len(snatched) == 0:
        logger.info('Nothing marked as snatched. Stopping postprocessor job.')
        common.schedule_job(action='Stop', target='processDir')
    elif len(downloads) == 0:
        logger.info('No downloads are found. Nothing to process.')
    else:
        logger.debug("Checking %s downloads for %s snatched files" % (len(downloads), len(snatched)))
        ppcount = 0  # number of items successfully processed in this run
        for book in snatched:
            found = False
            for fname in downloads:
                if not fname.endswith('.fail'):  # has this failed before?
                    # this is to get round differences in torrent filenames.
                    # Torrents aren't always returned with the name we searched for
                    # there might be a better way...
                    # NOTE(review): str.decode implies Python 2 byte strings here - confirm target version
                    if isinstance(fname, str):
                        matchname = fname.decode(lazylibrarian.SYS_ENCODING)
                    else:
                        matchname = fname
                    # compare only the part before any ' LL.(' marker on both sides
                    if ' LL.(' in matchname:
                        matchname = matchname.split(' LL.(')[0]
                    matchtitle = book['NZBtitle']
                    if ' LL.(' in matchtitle:
                        matchtitle = matchtitle.split(' LL.(')[0]
                    match = fuzz.token_set_ratio(matchtitle, matchname)
                    if match >= 95:
                        fname = matchname
                        if os.path.isfile(os.path.join(processpath, fname)):
                            # handle single file downloads here...
                            # a lone book/magazine file is moved into a folder named after it
                            if formatter.is_valid_booktype(fname, booktype="book") \
                                or formatter.is_valid_booktype(fname, booktype="mag"):
                                dirname = os.path.join(processpath, os.path.splitext(fname)[0])
                                if not os.path.exists(dirname):
                                    try:
                                        os.makedirs(dirname)
                                    except OSError as why:
                                        logger.debug('Failed to create directory %s, %s' % (dirname, why.strerror))
                                if os.path.exists(dirname):
                                    try:
                                        shutil.move(os.path.join(processpath, fname), os.path.join(dirname, fname))
                                        # from here on fname refers to the new folder, not the file
                                        fname = os.path.splitext(fname)[0]
                                    except Exception as why:
                                        logger.debug("Failed to move file %s to %s, %s" %
                                            (fname, dirname, str(why)))
                        if os.path.isdir(os.path.join(processpath, fname)):
                            pp_path = os.path.join(processpath, fname)
                            logger.debug('Found folder %s for %s' % (pp_path, book['NZBtitle']))
                            found = True
                            break
                    else:
                        logger.debug('No match (%s%%) %s for %s' % (match, matchname, matchtitle))
                else:
                    logger.debug('Skipping %s' % fname)
            if found:
                # try the books table first; a miss means BookID may be a magazine title
                data = myDB.select('SELECT * from books WHERE BookID="%s"' % book['BookID'])
                if data:
                    authorname = data[0]['AuthorName']
                    bookname = data[0]['BookName']
                    if 'windows' in platform.system().lower() and '/' in lazylibrarian.EBOOK_DEST_FOLDER:
                        logger.warn('Please check your EBOOK_DEST_FOLDER setting')
                        lazylibrarian.EBOOK_DEST_FOLDER = lazylibrarian.EBOOK_DEST_FOLDER.replace('/', '\\')

                    # Default destination path, should be allowed change per config file.
                    dest_path = lazylibrarian.EBOOK_DEST_FOLDER.replace('$Author', authorname).replace(
                        '$Title', bookname)
                    global_name = lazylibrarian.EBOOK_DEST_FILE.replace('$Author', authorname).replace(
                        '$Title', bookname)
                    global_name = common.remove_accents(global_name)
                    # dest_path = authorname+'/'+bookname
                    # global_name = bookname + ' - ' + authorname
                    # Remove characters we don't want in the filename BEFORE adding to DESTINATION_DIR
                    # as windows drive identifiers have colon, eg c:  but no colons allowed elsewhere?
                    dic = {'<': '', '>': '', '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's',
                           ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': ''}
                    dest_path = formatter.latinToAscii(formatter.replace_all(dest_path, dic))
                    dest_path = os.path.join(lazylibrarian.DESTINATION_DIR, dest_path).encode(
                        lazylibrarian.SYS_ENCODING)
                else:
                    data = myDB.select('SELECT * from magazines WHERE Title="%s"' % book['BookID'])
                    if data:
                        # AuxInfo was added for magazine release date, normally housed in 'magazines' but if multiple
                        # files are downloading, there will be an error in post-processing, trying to go to the
                        # same directory.
                        mostrecentissue = data[0]['IssueDate']  # keep for processing issues arriving out of order
                        # Remove characters we don't want in the filename before (maybe) adding to DESTINATION_DIR
                        # as windows drive identifiers have colon, eg c:  but no colons allowed elsewhere?
                        dic = {'<': '', '>': '', '...': '', ' & ': ' ', ' = ': ' ', '?': '', '$': 's',
                               ' + ': ' ', '"': '', ',': '', '*': '', ':': '', ';': '', '\'': ''}
                        mag_name = formatter.latinToAscii(formatter.replace_all(book['BookID'], dic))
                        # book auxinfo is a cleaned date, eg 2015-01-01
                        dest_path = lazylibrarian.MAG_DEST_FOLDER.replace(
                            '$IssueDate',
                            book['AuxInfo']).replace('$Title',
                                                     mag_name)
                        # dest_path = '_Magazines/'+title+'/'+book['AuxInfo']
                        if lazylibrarian.MAG_RELATIVE:
                            # relative magazine folders get a leading '_' unless they already start with '.' or '_'
                            if dest_path[0] not in '._':
                                dest_path = '_' + dest_path
                            dest_path = os.path.join(lazylibrarian.DESTINATION_DIR, dest_path).encode(
                                lazylibrarian.SYS_ENCODING)
                        else:
                            dest_path = dest_path.encode(lazylibrarian.SYS_ENCODING)
                        # authorname/bookname of None mark this item as a magazine further down
                        authorname = None
                        bookname = None
                        global_name = lazylibrarian.MAG_DEST_FILE.replace('$IssueDate', book['AuxInfo']).replace(
                            '$Title', mag_name)
                        global_name = common.remove_accents(global_name)
                        # global_name = book['AuxInfo']+' - '+title
                    else:
                        logger.debug("Snatched magazine %s is not in download directory" % (book['BookID']))
                        continue
            else:
                logger.debug("Snatched %s %s is not in download directory" % (book['NZBmode'], book['NZBtitle']))
                continue

            # try:
            #    os.chmod(dest_path, 0777)
            # except Exception, e:
            #    logger.debug("Could not chmod post-process directory: " + str(dest_path))

            processBook = processDestination(pp_path, dest_path, authorname, bookname, global_name)

            if processBook:
                logger.debug("Processing %s, %s" % (global_name, book['NZBurl']))
                # update nzbs, only update the snatched ones in case multiple matches for same book / magazine issue
                controlValueDict = {"NZBurl": book['NZBurl'], "Status": "Snatched"}
                newValueDict = {"Status": "Processed", "NZBDate": formatter.now()}  # say when we processed it
                myDB.upsert("wanted", newValueDict, controlValueDict)

                if bookname is not None:  # it's a book, if None it's a magazine
                    processExtras(myDB, dest_path, global_name, data)
                else:
                    # update mags
                    controlValueDict = {"Title": book['BookID']}
                    # only advance the stored IssueDate when this issue is not older than the stored one
                    if mostrecentissue > book['AuxInfo']:  # check this in case processing issues arriving out of order
                        newValueDict = {"LastAcquired": formatter.today(), "IssueStatus": "Open"}
                    else:
                        newValueDict = {"IssueDate": book['AuxInfo'], "LastAcquired": formatter.today(),
                                        "IssueStatus": "Open"}
                    myDB.upsert("magazines", newValueDict, controlValueDict)
                    # dest_path is where we put the magazine after processing, but we don't have the full filename
                    # so look for any "book" in that directory
                    dest_file = book_file(dest_path, booktype='mag')
                    controlValueDict = {"Title": book['BookID'], "IssueDate": book['AuxInfo']}
                    newValueDict = {"IssueAcquired": formatter.today(),
                                    "IssueFile": dest_file,
                                    "IssueID": magazinescan.create_id("%s %s" % (book['BookID'], book['AuxInfo']))
                                    }
                    myDB.upsert("issues", newValueDict, controlValueDict)

                    # create a thumbnail cover for the new issue
                    magazinescan.create_cover(dest_file)

                logger.info('Successfully processed: %s' % global_name)
                ppcount = ppcount + 1
                notifiers.notify_download(formatter.latinToAscii(global_name) + ' at ' + formatter.now())
            else:
                logger.error('Postprocessing for %s has failed.' % global_name)
                logger.error('Warning - Residual files remain in %s.fail' % pp_path)
                # at this point, as it failed we should move it or it will get postprocessed
                # again (and fail again)
                # best effort: any failure to rename is only logged
                try:
                    os.rename(pp_path, pp_path + '.fail')
                except:
                    logger.debug("Unable to rename %s" % pp_path)

        downloads = os.listdir(processpath)  # check in case we processed/deleted some above
        for directory in downloads:
            if "LL.(" in directory and not directory.endswith('.fail'):
                # the book id is the text between "LL.(" and the next ")"
                bookID = str(directory).split("LL.(")[1].split(")")[0]
                logger.debug("Book with id: " + str(bookID) + " is in downloads")
                pp_path = os.path.join(processpath, directory)

                # for a single file, import from the download directory itself
                if os.path.isfile(pp_path):
                    pp_path = os.path.join(processpath)

                if (os.path.isdir(pp_path)):
                    logger.debug('Found LL folder %s.' % pp_path)
                # NOTE(review): import_book is called even when pp_path is not a directory - confirm intent
                if import_book(pp_path, bookID):
                    ppcount = ppcount + 1

        if ppcount:
            logger.info('%s books/mags have been processed.' % ppcount)
        else:
            logger.info('No snatched books/mags have been found')
    if reset:
        common.schedule_job(action='Restart', target='processDir')