def _get_epub_standard_word_count(iterator, lang='en'):
    '''
    Count the individual words in a book instead of estimating pages.

    The book text is read with _read_epub_contents(iterator, strip_html=True)
    and counted with the first of these calibre methods that works:
      1. calibre.spell.break_iterator.count_words(text, lang)
      2. len(calibre.spell.break_iterator.split_into_words_and_positions(text, lang))
      3. calibre.utils.wordcount.get_wordcount_obj(text).words
         (this last method is not given the language)

    iterator: handed unchanged to _read_epub_contents.
    lang: language code passed to the break_iterator based counters.

    Returns the word count produced by whichever method succeeded. An error
    raised by the final fallback is not handled here and reaches the caller.
    '''
    book_text = _read_epub_contents(iterator, strip_html=True)
    try:
        from calibre.spell.break_iterator import count_words
        wordcount = count_words(book_text, lang)
        logger.debug('\tWord count - count_words method:%s' % wordcount)
    except Exception:
        # Broad on purpose: either a missing import or a runtime failure in
        # the counter should trigger the fallback. It is not a bare
        # ``except:`` so that KeyboardInterrupt and SystemExit are no longer
        # swallowed and silently retried with another method.
        try:
            # The above method is new and no-one will have it as of
            # 08/01/2016. Use an older method for a beta.
            from calibre.spell.break_iterator import split_into_words_and_positions
            wordcount = len(split_into_words_and_positions(book_text, lang))
            logger.debug(
                '\tWord count - split_into_words_and_positions method:%s' % wordcount)
        except Exception:
            # Last resort: the original word counter, which takes no language.
            from calibre.utils.wordcount import get_wordcount_obj
            wordcount = get_wordcount_obj(book_text)
            wordcount = wordcount.words
            logger.debug('\tWord count - old method:%s' % wordcount)
    return wordcount
def _get_epub_standard_word_count(iterator):
    '''
    Return the number of words in the book, counting individual words
    rather than pages.

    The text comes from _read_epub_contents(iterator, strip_html=True) and is
    counted by calibre's get_wordcount_obj; its ``words`` attribute is the
    value returned.
    '''
    # Imported before the book is read, so a missing module fails early.
    from calibre.utils.wordcount import get_wordcount_obj

    stripped_text = _read_epub_contents(iterator, strip_html=True)
    return get_wordcount_obj(stripped_text).words
def _get_epub_standard_word_count(iterator, lang='en'):
    '''
    Count the individual words in a book instead of estimating pages.

    The book text is read with _read_epub_contents(iterator, strip_html=True).
    Three calibre counting methods are tried in order, moving on to the next
    one whenever the current one raises:
      1. calibre.spell.break_iterator.count_words(text, lang)
      2. len(calibre.spell.break_iterator.split_into_words_and_positions(text, lang))
      3. calibre.utils.wordcount.get_wordcount_obj(text).words
         (this last method is not given the language)

    iterator: handed unchanged to _read_epub_contents.
    lang: language code passed to the break_iterator based counters.

    Returns the word count from the first method that succeeded. An error
    raised by the third method is not caught here.
    '''
    book_text = _read_epub_contents(iterator, strip_html=True)
    try:
        from calibre.spell.break_iterator import count_words
        wordcount = count_words(book_text, lang)
        logger.debug('\tWord count - count_words method:%s' % wordcount)
    except Exception:
        # Catch Exception rather than using a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit propagate instead of triggering a
        # silent retry. Import errors and counter failures still fall back.
        try:
            # The above method is new and no-one will have it as of
            # 08/01/2016. Use an older method for a beta.
            from calibre.spell.break_iterator import split_into_words_and_positions
            wordcount = len(split_into_words_and_positions(book_text, lang))
            logger.debug('\tWord count - split_into_words_and_positions method:%s' % wordcount)
        except Exception:
            # Last resort: the original word counter, which takes no language.
            from calibre.utils.wordcount import get_wordcount_obj
            wordcount = get_wordcount_obj(book_text)
            wordcount = wordcount.words
            logger.debug('\tWord count - old method:%s' % wordcount)
    return wordcount
def get_word_count(self, html):
    '''
    Return the number of words in an HTML string.

    The <head>...</head> section is removed first (the pattern spans
    newlines), then every remaining tag is stripped, and the text that is
    left is counted with get_wordcount_obj.
    '''
    without_head = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
    plain_text = re.sub(r'<[^>]*>', '', without_head)
    return get_wordcount_obj(plain_text).words