Example No. 1
class BookSorter:
    def __init__(self,
                 file_list,
                 mode,
                 database_path,
                 settings,
                 temp_dir=None):
        # Have the GUI pass a list of files straight to here
        # Then, on the basis of what is needed, pass the
        # filenames to the requisite functions
        # This includes getting file info for the database
        # Parsing for the reader proper
        # Caching upon closing
        self.file_list = [i for i in file_list if os.path.exists(i)]
        self.statistics = [0, len(file_list)]
        self.hashes_and_paths = {}
        self.work_mode = mode[0]
        self.addition_mode = mode[1]
        self.database_path = database_path
        self.auto_tags = settings['auto_tags']
        self.auto_cover = settings['auto_cover']
        self.temp_dir = temp_dir
        if database_path:
            self.database_hashes()

        self.threading_completed = []
        # A single Manager provides both shared objects
        manager = Manager()
        self.queue = manager.Queue()
        self.errors = manager.list()
        self.processed_books = []

        if self.work_mode == 'addition':
            progress_object_generator()

    def database_hashes(self):
        all_hashes_and_paths = database.DatabaseFunctions(
            self.database_path).fetch_data(('Hash', 'Path'), 'books',
                                           {'Hash': ''}, 'LIKE')

        if all_hashes_and_paths:
            self.hashes_and_paths = {i[0]: i[1] for i in all_hashes_and_paths}

    def database_entry_for_book(self, file_hash):
        database_return = database.DatabaseFunctions(
            self.database_path).fetch_data(
                ('Title', 'Author', 'Year', 'ISBN', 'Tags', 'Position',
                 'Bookmarks', 'CoverImage', 'Annotations'), 'books',
                {'Hash': file_hash}, 'EQUALS')[0]

        book_data = []

        for count, i in enumerate(database_return):
            if count in (
                    5, 6,
                    8):  # Position, Bookmarks, and Annotations are pickled
                if i:
                    book_data.append(pickle.loads(i))
                else:
                    book_data.append(None)
            else:
                book_data.append(i)

        return book_data

    def read_book(self, filename):
        # filename is expected as a string containing the
        # full path of the ebook file

        with open(filename, 'rb') as current_book:
            # This should speed up addition for larger files
            # without compromising the integrity of the process
            first_bytes = current_book.read(1024 * 32)  # First 32KB of the file
            file_md5 = hashlib.md5(first_bytes).hexdigest()

        # Update the progress queue
        self.queue.put(filename)

        # This check applies only in addition mode: do not add the file
        # if it is already in the database and still exists at its
        # original path
        if self.work_mode == 'addition' and file_md5 in self.hashes_and_paths:
            if (self.hashes_and_paths[file_md5] == filename
                    or os.path.exists(self.hashes_and_paths[file_md5])):

                if self.hashes_and_paths[file_md5] != filename:
                    warning_string = (
                        f'{os.path.basename(filename)} is already in database')
                    logger.warning(warning_string)
                return

        # Match extensions with endswith() rather than splitting on '.'
        # so that filenames containing extra dots are handled correctly.
        # All hail the roundabout fix.
        valid_extension = False
        for i in sorter:
            if os.path.basename(filename).endswith(i):
                file_extension = i
                valid_extension = True
                break

        if not valid_extension:
            this_error = 'Unsupported extension: ' + filename
            self.errors.append(this_error)
            logger.error(this_error)
            return

        book_ref = sorter[file_extension](filename, self.temp_dir, file_md5)

        # None of the following catch a specific exception type.
        # This keeps everything from crashing, but makes
        # troubleshooting difficult.
        # TODO: In-application notifications

        try:
            book_ref.read_book()
        except Exception as e:
            this_error = f'Error initializing: {filename}'
            self.errors.append(this_error)
            logger.exception(this_error +
                             f' {type(e).__name__} Arguments: {e.args}')
            return

        this_book = {}
        this_book[file_md5] = {'hash': file_md5, 'path': filename}

        # Different modes require different values
        if self.work_mode == 'addition':
            try:
                metadata = book_ref.generate_metadata()
            except Exception as e:
                this_error = f'Metadata generation error: {filename}'
                self.errors.append(this_error)
                logger.exception(this_error +
                                 f' {type(e).__name__} Arguments: {e.args}')
                return

            title = metadata.title
            author = metadata.author
            year = metadata.year
            isbn = metadata.isbn

            tags = None
            if self.auto_tags:
                tags = metadata.tags

            cover_image_raw = metadata.cover
            if cover_image_raw:
                cover_image = resize_image(cover_image_raw)
            else:
                cover_image = None
                if self.auto_cover:
                    cover_image = fetch_cover(title, author)

            this_book[file_md5]['cover_image'] = cover_image
            this_book[file_md5]['addition_mode'] = self.addition_mode

        if self.work_mode == 'reading':
            try:
                book_breakdown = book_ref.generate_content()
            except Exception as e:
                this_error = f'Content generation error: {filename}'
                self.errors.append(this_error)
                logger.exception(this_error +
                                 f' {type(e).__name__} Arguments: {e.args}')
                return

            toc = book_breakdown[0]
            content = book_breakdown[1]
            images_only = book_breakdown[2]

            try:
                book_data = self.database_entry_for_book(file_md5)
            except TypeError:
                # No database row exists for this hash
                logger.error(
                    f'Database error: {filename}. Re-add book to program')
                return

            # '&' is doubled so that Qt does not treat it as a
            # keyboard shortcut marker
            title = book_data[0].replace('&', '&&')
            author = book_data[1]
            year = book_data[2]
            isbn = book_data[3]
            tags = book_data[4]
            position = book_data[5]
            bookmarks = book_data[6]
            cover = book_data[7]
            annotations = book_data[8]

            this_book[file_md5]['position'] = position
            this_book[file_md5]['bookmarks'] = bookmarks
            this_book[file_md5]['toc'] = toc
            this_book[file_md5]['content'] = content
            this_book[file_md5]['images_only'] = images_only
            this_book[file_md5]['cover'] = cover
            this_book[file_md5]['annotations'] = annotations

        this_book[file_md5]['title'] = title
        this_book[file_md5]['author'] = author
        this_book[file_md5]['year'] = year
        this_book[file_md5]['isbn'] = isbn
        this_book[file_md5]['tags'] = tags

        return this_book

    def read_progress(self):
        while True:
            processed_file = self.queue.get()
            self.threading_completed.append(processed_file)

            total_number = len(self.file_list)
            completed_number = len(self.threading_completed)

            # Just for the record, this slows down book searching by about 20%
            if _progress_emitter:  # Skip update in reading mode
                _progress_emitter.update_progress(completed_number * 100 //
                                                  total_number)

            if total_number == completed_number:
                break

    def initiate_threads(self):
        if not self.file_list:
            return None

        def pool_creator():
            _pool = Pool(thread_count)
            self.processed_books = _pool.map(self.read_book, self.file_list)

            _pool.close()
            _pool.join()

        start_time = time.time()

        worker_thread = threading.Thread(target=pool_creator)
        progress_thread = threading.Thread(target=self.read_progress)
        worker_thread.start()
        progress_thread.start()

        worker_thread.join()
        progress_thread.join(timeout=0.5)

        return_books = {}
        # Exclude None returns generated in case of duplication / parse errors
        self.processed_books = [i for i in self.processed_books if i]
        for i in self.processed_books:
            for j in i:
                return_books[j] = i[j]

        del self.processed_books
        processing_time = time.time() - start_time
        logger.info(f'Finished processing in {processing_time:.2f} seconds')

        return return_books, self.errors
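
A minimal sketch of how this version might be invoked, assuming the surrounding module supplies the names the class relies on (sorter, database, progress_object_generator, thread_count, _progress_emitter). The file paths, database path, settings, and the second mode value here are hypothetical, not taken from the original.

# Hypothetical caller; paths, settings, and the addition_mode value
# below are illustrative assumptions only
settings = {'auto_tags': True, 'auto_cover': False}
book_sorter = BookSorter(
    ['/home/user/books/example.epub'],  # hypothetical file list
    ('addition', 'manual'),             # (work_mode, addition_mode)
    '/home/user/.local/share/library.db',
    settings,
    temp_dir='/tmp/book-cache')
result = book_sorter.initiate_threads()
if result:  # initiate_threads returns None for an empty file list
    parsed_books, errors = result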
Example No. 2
class BookSorter:
    def __init__(self,
                 file_list,
                 mode,
                 database_path,
                 auto_tags=True,
                 temp_dir=None):
        # Have the GUI pass a list of files straight to here
        # Then, on the basis of what is needed, pass the
        # filenames to the requisite functions
        # This includes getting file info for the database
        # Parsing for the reader proper
        # Caching upon closing
        self.file_list = [i for i in file_list if os.path.exists(i)]
        self.statistics = [0, len(file_list)]
        self.hashes_and_paths = {}
        self.work_mode = mode[0]
        self.addition_mode = mode[1]
        self.database_path = database_path
        self.auto_tags = auto_tags
        self.temp_dir = temp_dir
        if database_path:
            self.database_hashes()

        self.threading_completed = []
        self.queue = Manager().Queue()
        self.processed_books = []

        if self.work_mode == 'addition':
            progress_object_generator()

    def database_hashes(self):
        all_hashes_and_paths = database.DatabaseFunctions(
            self.database_path).fetch_data(('Hash', 'Path'), 'books',
                                           {'Hash': ''}, 'LIKE')

        if all_hashes_and_paths:
            self.hashes_and_paths = {i[0]: i[1] for i in all_hashes_and_paths}

    def database_entry_for_book(self, file_hash):
        database_return = database.DatabaseFunctions(
            self.database_path).fetch_data(
                ('Title', 'Author', 'Year', 'ISBN', 'Tags', 'Position',
                 'Bookmarks', 'CoverImage', 'Annotations'), 'books',
                {'Hash': file_hash}, 'EQUALS')[0]

        book_data = []

        for count, i in enumerate(database_return):
            if count in (
                    5, 6,
                    8):  # Position, Bookmarks, and Annotations are pickled
                if i:
                    book_data.append(pickle.loads(i))
                else:
                    book_data.append(None)
            else:
                book_data.append(i)

        return book_data

    def read_book(self, filename):
        # filename is expected as a string containing the
        # full path of the ebook file

        with open(filename, 'rb') as current_book:
            # This should speed up addition for larger files
            # without compromising the integrity of the process
            first_bytes = current_book.read(1024 * 32)  # First 32KB of the file
            file_md5 = hashlib.md5(first_bytes).hexdigest()

        # Update the progress queue
        self.queue.put(filename)

        # This check applies only in addition mode: do not add the file
        # if it is already in the database and still exists at its
        # original path
        if self.work_mode == 'addition' and file_md5 in self.hashes_and_paths:
            if (self.hashes_and_paths[file_md5] == filename
                    or os.path.exists(self.hashes_and_paths[file_md5])):

                if self.hashes_and_paths[file_md5] != filename:
                    print(
                        f'{os.path.basename(filename)} is already in database')
                return

        # Match extensions with endswith() rather than splitting on '.'
        # so that filenames containing extra dots are handled correctly.
        # All hail the roundabout fix.
        valid_extension = False
        for i in sorter:
            if os.path.basename(filename).endswith(i):
                file_extension = i
                valid_extension = True
                break

        if not valid_extension:
            print(filename + ' has an unsupported extension')
            return

        book_ref = sorter[file_extension](filename, self.temp_dir, file_md5)

        # Everything following this is standard across formats.
        # A falsy return from read_book() means the file could not be parsed.
        is_valid = book_ref.read_book()
        if not is_valid:
            print('Cannot parse: ' + filename)
            return

        if book_ref.book:

            this_book = {}
            this_book[file_md5] = {'hash': file_md5, 'path': filename}

            # Different modes require different values
            if self.work_mode == 'addition':
                # Reduce the size of the incoming image
                # if one is found
                title = book_ref.get_title()
                author = book_ref.get_author()
                year = book_ref.get_year()
                isbn = book_ref.get_isbn()

                tags = None
                if self.auto_tags:
                    tags = book_ref.get_tags()

                cover_image_raw = book_ref.get_cover_image()
                if cover_image_raw:
                    cover_image = resize_image(cover_image_raw)
                else:
                    cover_image = None

                this_book[file_md5]['cover_image'] = cover_image
                this_book[file_md5]['addition_mode'] = self.addition_mode

            if self.work_mode == 'reading':
                all_content = book_ref.get_contents()

                # get_contents() returns a tuple. Index 1 is a dictionary of
                # special settings that depend on the kind of data being
                # parsed. Currently this includes:
                #   images_only (bool): the book contains only images

                content = all_content[0]
                images_only = all_content[1]['images_only']

                if not content:
                    content = [('Invalid', 'Something went horribly wrong')]

                book_data = self.database_entry_for_book(file_md5)
                title = book_data[0]
                author = book_data[1]
                year = book_data[2]
                isbn = book_data[3]
                tags = book_data[4]
                position = book_data[5]
                bookmarks = book_data[6]
                cover = book_data[7]
                annotations = book_data[8]

                this_book[file_md5]['position'] = position
                this_book[file_md5]['bookmarks'] = bookmarks
                this_book[file_md5]['content'] = content
                this_book[file_md5]['images_only'] = images_only
                this_book[file_md5]['cover'] = cover
                this_book[file_md5]['annotations'] = annotations

            this_book[file_md5]['title'] = title
            this_book[file_md5]['author'] = author
            this_book[file_md5]['year'] = year
            this_book[file_md5]['isbn'] = isbn
            this_book[file_md5]['tags'] = tags

            return this_book

    def read_progress(self):
        while True:
            processed_file = self.queue.get()
            self.threading_completed.append(processed_file)

            total_number = len(self.file_list)
            completed_number = len(self.threading_completed)

            if progress_emitter:  # Skip update in reading mode
                progress_emitter.update_progress(completed_number * 100 //
                                                 total_number)

            if total_number == completed_number:
                break

    def initiate_threads(self):
        if not self.file_list:
            return None

        def pool_creator():
            _pool = Pool(5)
            self.processed_books = _pool.map(self.read_book, self.file_list)

            _pool.close()
            _pool.join()

        start_time = time.time()

        worker_thread = threading.Thread(target=pool_creator)
        progress_thread = threading.Thread(target=self.read_progress)
        worker_thread.start()
        progress_thread.start()

        worker_thread.join()
        progress_thread.join(timeout=0.5)

        return_books = {}
        # Exclude None returns generated in case of duplication / parse errors
        self.processed_books = [i for i in self.processed_books if i]
        for i in self.processed_books:
            for j in i:
                return_books[j] = i[j]

        del self.processed_books
        print('Finished processing in', time.time() - start_time)
        return return_books
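
All three versions look up a parser in a module-level sorter mapping that is never shown in these examples: each entry maps a file extension to a class constructed as sorter[extension](filename, temp_dir, file_md5). A plausible shape, with a purely hypothetical stub parser, is:

# Hypothetical stub; the real parser classes are defined elsewhere in
# the module and are not part of these examples
class ParseEPUB:
    def __init__(self, filename, temp_dir, file_md5):
        self.filename = filename
        self.temp_dir = temp_dir
        self.file_md5 = file_md5

    def read_book(self):
        ...  # open and validate the file

    def generate_metadata(self):
        ...  # return an object with .title, .author, .year, .isbn, .tags, .cover

    def generate_content(self):
        ...  # return (toc, content, images_only)

# One entry per supported extension; keys are matched with endswith()
sorter = {
    'epub': ParseEPUB,
}

Whatever the real classes look like, Examples 1 and 3 call read_book(), generate_metadata(), and generate_content() on them, while Example 2 expects the older get_title()/get_cover_image()/get_contents() family.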
Example No. 3
class BookSorter:
    def __init__(self, file_list, mode, database_path, settings, temp_dir=None):
        # Have the GUI pass a list of files straight to here
        # Then, on the basis of what is needed, pass the
        # filenames to the requisite functions
        # This includes getting file info for the database
        # Parsing for the reader proper
        # Caching upon closing
        self.file_list = [i for i in file_list if os.path.exists(i)]
        self.statistics = [0, len(file_list)]
        self.hashes_and_paths = {}
        self.work_mode = mode[0]
        self.addition_mode = mode[1]
        self.database_path = database_path
        self.auto_tags = settings['auto_tags']
        self.auto_cover = settings['auto_cover']
        self.temp_dir = temp_dir
        if database_path:
            self.database_hashes()

        self.threading_completed = []
        # A single Manager provides both shared objects
        manager = Manager()
        self.queue = manager.Queue()
        self.errors = manager.list()
        self.processed_books = []

        if self.work_mode == 'addition':
            progress_object_generator()

    def database_hashes(self):
        all_hashes_and_paths = database.DatabaseFunctions(
            self.database_path).fetch_data(
                ('Hash', 'Path'),
                'books',
                {'Hash': ''},
                'LIKE')

        if all_hashes_and_paths:
            self.hashes_and_paths = {
                i[0]: i[1] for i in all_hashes_and_paths}

    def database_entry_for_book(self, file_hash):
        database_return = database.DatabaseFunctions(
            self.database_path).fetch_data(
                ('Title', 'Author', 'Year', 'ISBN', 'Tags',
                 'Position', 'Bookmarks', 'CoverImage', 'Annotations'),
                'books',
                {'Hash': file_hash},
                'EQUALS')[0]

        book_data = []

        for count, i in enumerate(database_return):
            if count in (5, 6, 8):  # Position, Bookmarks, and Annotations are pickled
                if i:
                    book_data.append(pickle.loads(i))
                else:
                    book_data.append(None)
            else:
                book_data.append(i)

        return book_data

    def read_book(self, filename):
        # filename is expected as a string containing the
        # full path of the ebook file

        with open(filename, 'rb') as current_book:
            # This should speed up addition for larger files
            # without compromising the integrity of the process
            first_bytes = current_book.read(1024 * 32)  # First 32KB of the file
            file_md5 = hashlib.md5(first_bytes).hexdigest()

        # Update the progress queue
        self.queue.put(filename)

        # This check applies only in addition mode: do not add the file
        # if it is already in the database and still exists at its
        # original path
        if self.work_mode == 'addition' and file_md5 in self.hashes_and_paths:
            if (self.hashes_and_paths[file_md5] == filename
                    or os.path.exists(self.hashes_and_paths[file_md5])):

                if self.hashes_and_paths[file_md5] != filename:
                    warning_string = (
                        f'{os.path.basename(filename)} is already in database')
                    logger.warning(warning_string)
                return

        # Match extensions with endswith() rather than splitting on '.'
        # so that filenames containing extra dots are handled correctly.
        # All hail the roundabout fix.
        valid_extension = False
        for i in sorter:
            if os.path.basename(filename).endswith(i):
                file_extension = i
                valid_extension = True
                break

        if not valid_extension:
            this_error = 'Unsupported extension: ' + filename
            self.errors.append(this_error)
            logger.error(this_error)
            return

        book_ref = sorter[file_extension](filename, self.temp_dir, file_md5)

        # None of the following catch a specific exception type.
        # This keeps everything from crashing, but makes
        # troubleshooting difficult.
        # TODO: In-application notifications

        try:
            book_ref.read_book()
        except Exception as e:
            this_error = f'Error initializing: {filename}'
            self.errors.append(this_error)
            logger.exception(this_error + f' {type(e).__name__} Arguments: {e.args}')
            return

        this_book = {}
        this_book[file_md5] = {
            'hash': file_md5,
            'path': filename}

        # Different modes require different values
        if self.work_mode == 'addition':
            try:
                metadata = book_ref.generate_metadata()
            except Exception as e:
                this_error = f'Metadata generation error: {filename}'
                self.errors.append(this_error)
                logger.exception(this_error + f' {type(e).__name__} Arguments: {e.args}')
                return

            title = metadata.title
            author = metadata.author
            year = metadata.year
            isbn = metadata.isbn

            tags = None
            if self.auto_tags:
                tags = metadata.tags

            cover_image_raw = metadata.cover
            if cover_image_raw:
                cover_image = resize_image(cover_image_raw)
            else:
                cover_image = None
                if self.auto_cover:
                    cover_image = fetch_cover(title, author)

            this_book[file_md5]['cover_image'] = cover_image
            this_book[file_md5]['addition_mode'] = self.addition_mode

        if self.work_mode == 'reading':
            try:
                book_breakdown = book_ref.generate_content()
            except Exception as e:
                this_error = f'Content generation error: {filename}'
                self.errors.append(this_error)
                logger.exception(this_error + f' {type(e).__name__} Arguments: {e.args}')
                return

            toc = book_breakdown[0]
            content = book_breakdown[1]
            images_only = book_breakdown[2]

            try:
                book_data = self.database_entry_for_book(file_md5)
            except TypeError:
                # No database row exists for this hash
                logger.error(
                    f'Database error: {filename}. Re-add book to program')
                return

            # '&' is doubled so that Qt does not treat it as a
            # keyboard shortcut marker
            title = book_data[0].replace('&', '&&')
            author = book_data[1]
            year = book_data[2]
            isbn = book_data[3]
            tags = book_data[4]
            position = book_data[5]
            bookmarks = book_data[6]
            cover = book_data[7]
            annotations = book_data[8]

            this_book[file_md5]['position'] = position
            this_book[file_md5]['bookmarks'] = bookmarks
            this_book[file_md5]['toc'] = toc
            this_book[file_md5]['content'] = content
            this_book[file_md5]['images_only'] = images_only
            this_book[file_md5]['cover'] = cover
            this_book[file_md5]['annotations'] = annotations

        this_book[file_md5]['title'] = title
        this_book[file_md5]['author'] = author
        this_book[file_md5]['year'] = year
        this_book[file_md5]['isbn'] = isbn
        this_book[file_md5]['tags'] = tags

        return this_book

    def read_progress(self):
        while True:
            processed_file = self.queue.get()
            self.threading_completed.append(processed_file)

            total_number = len(self.file_list)
            completed_number = len(self.threading_completed)

            # Just for the record, this slows down book searching by about 20%
            if _progress_emitter:  # Skip update in reading mode
                _progress_emitter.update_progress(
                    completed_number * 100 // total_number)

            if total_number == completed_number:
                break

    def initiate_threads(self):
        if not self.file_list:
            return None

        def pool_creator():
            _pool = Pool(thread_count)
            self.processed_books = _pool.map(
                self.read_book, self.file_list)

            _pool.close()
            _pool.join()

        start_time = time.time()

        worker_thread = threading.Thread(target=pool_creator)
        progress_thread = threading.Thread(target=self.read_progress)
        worker_thread.start()
        progress_thread.start()

        worker_thread.join()
        progress_thread.join(timeout=0.5)

        return_books = {}
        # Exclude None returns generated in case of duplication / parse errors
        self.processed_books = [i for i in self.processed_books if i]
        for i in self.processed_books:
            for j in i:
                return_books[j] = i[j]

        del self.processed_books
        processing_time = time.time() - start_time
        logger.info(f'Finished processing in {processing_time:.2f} seconds')

        return return_books, self.errors
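
One detail shared by all three versions: the deduplication key is an MD5 of only the first 32KB of each file, not of its full contents. As a standalone sketch of that trade-off (a hypothetical helper, not part of the original code):

import hashlib

def quick_file_hash(path, block_size=1024 * 32):
    # Hash only the first 32KB, as read_book() does above: fast for
    # large files, at the cost of assuming two distinct books never
    # share their first 32KB
    with open(path, 'rb') as infile:
        return hashlib.md5(infile.read(block_size)).hexdigest()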