import nltk.data

def FleschKincaidGradeLevel(text):
    """Compute the Flesch-Kincaid grade level of `text` (an iterable of lines)."""
    sentences = []
    filtered_words = []
    syllableCount = 0
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    for line in text:
        line = line.lower()  # lower() returns a new string; the result must be kept
        words = TOKENIZER.tokenize(line)  # module-level word tokenizer, assumed defined elsewhere
        for word in words:
            if word in SPECIAL_CHARS:
                continue
            new_word = word.strip().replace(",", "").replace(".", "")
            new_word = new_word.replace("!", "").replace("?", "")
            if new_word:
                filtered_words.append(new_word)
                # count syllables per word, not per character
                syllableCount += syllables_en.count(new_word)
        sentences.extend(tokenizer.tokenize(line))  # extend, so len() counts sentences, not lines
    word_count = len(filtered_words)
    sentence_count = len(sentences)
    syllable_count = syllableCount
    score = 0.0
    if word_count > 0 and sentence_count > 0:
        avg_words_p_sentence = float(word_count) / sentence_count  # avoid integer division
        score = (0.39 * avg_words_p_sentence
                 + 11.8 * (float(syllable_count) / word_count)
                 - 15.59)
    return round(score, 4)
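A minimal usage sketch for the function above, assuming SPECIAL_CHARS, syllables_en, and the word tokenizer TOKENIZER are defined as elsewhere in these snippets, and that the NLTK punkt data is installed (nltk.download('punkt')); the sample lines are arbitrary:

# Hypothetical call; expects an iterable of lines, not a single string.
lines = [
    "The quick brown fox jumps over the lazy dog.",
    "Readability scores estimate the school grade needed to follow a text.",
]
print(FleschKincaidGradeLevel(lines))  # e.g. a single-digit grade level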
def count_syllables(words):
    for w in words:
        if w in exclude:
            continue
        s = syllables_en.count(w)
        if s > 7:  # probably a LaTeX thing and not a word
            continue
        yield s
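Because this variant is a generator, the caller aggregates the per-word counts itself; a hedged sketch, assuming exclude and syllables_en are in scope:

words = "the quick brown fox jumps over the lazy dog".split()
total_syllables = sum(count_syllables(words))  # skips excluded and implausible tokens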
def count_syllables(word):
    """Given a word as a string, return the number of syllables
    in that word, as best we can."""
    if word in PRONUNCIATIONS:
        try:
            return count_syllables_phonemes(PRONUNCIATIONS[word][0])
        except Exception:
            pass
    return syllables_en.count(word)
def count_syllables(words, lang):
    syllableCount = 0
    for word in words:
        if lang == 'pt':
            try:
                syllableCount += syllables_pt.count(word)
            except UnicodeDecodeError:
                # Python 2 fallback: retry with a decoded copy of the word
                syllableCount += syllables_pt.count(word.decode('utf8', 'ignore'))
        else:
            syllableCount += syllables_en.count(word)
    return syllableCount
def count_syllables(word):
    """Given a word as a string, return the number of syllables
    in that word, as best we can."""
    # special case for Russian, or perhaps other languages that use Cyrillic
    if cyrillic(word):
        return count_syllables_cyrillic(word)
    if word in PRONUNCIATIONS:
        try:
            return count_syllables_phonemes(PRONUNCIATIONS[word][0])
        except Exception:
            pass
    return syllables_en.count(word)
def syllable_stats(words):
    totsyl = 0
    polysylcount = 0
    complexwords = 0
    for w in words:
        if w in exclude:
            continue
        s = syllables_en.count(w)
        if s > 7:  # probably a LaTeX thing and not a word
            continue
        totsyl += s
        if s >= 3:
            polysylcount += s
        complex_s = s
        # complex words are not proper nouns, have >= 3 syllables not counting
        # common endings (and are not compound words, not checked here)
        if any(w.endswith(ending) for ending in ('es', 'ed', 'ing')):
            complex_s = s - 1
        if complex_s >= 3 and w[0].islower() and w not in easy_word_set:
            complexwords += 1
    return totsyl, polysylcount, complexwords
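The three counters returned above are the raw ingredients of standard readability indices. As a hedged illustration, the formula below is the standard Gunning Fog index, but wiring it to syllable_stats is an assumption and gunning_fog is a hypothetical helper:

def gunning_fog(words, num_sentences):
    # 0.4 * (average sentence length + percentage of complex words)
    totsyl, polysylcount, complexwords = syllable_stats(words)
    num_words = len(words)  # note: includes words syllable_stats skipped
    return 0.4 * (num_words / num_sentences + 100.0 * complexwords / num_words)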
def load_book_features(filename, smartStopWords={}, pronSet={}, conjSet={}):
    '''
    Load features for each book in the corpus. There are 4 + RANGE*4 features
    for each instance. These features are:
    ---------------------------------------------------------------------------
    No.  Feature Name                                          No. of features
    ---------------------------------------------------------------------------
    1.   number of hapax legomena divided by number of unique words         1
    2.   number of dis legomena divided by number of unique words           1
    3.   number of unique words divided by number of total words            1
    4.   flesch readability score divided by 100                            1
    5.   no. of sentences of length in the range [1, RANGE]
         divided by the number of total sentences                       RANGE
    6.   no. of words of length in the range [1, RANGE]
         divided by the number of total words                           RANGE
    7.   no. of nominative pronouns per sentence in the range [1, RANGE]
         divided by the number of total sentences                       RANGE
    8.   no. of (coordinating + subordinating) conjunctions per sentence
         in the range [1, RANGE] divided by the number of total sentences   RANGE
    '''
    text = extract_book_contents(open(filename, 'r').read()).lower()
    contents = re.sub('\'s|(\r\n)|-+|["_]', ' ', text)  # remove \r\n, apostrophes, and dashes

    sentenceList = sent_tokenize(contents.strip())

    cleanWords = []
    sentenceLenDist = []
    pronDist = []
    conjDist = []
    sentences = []
    totalWords = 0
    wordLenDist = []
    totalSyllables = 0
    for sentence in sentenceList:
        if sentence != ".":
            pronCount = 0
            conjCount = 0
            sentences.append(sentence)
            sentenceWords = re.findall(r"[\w']+", sentence)
            totalWords += len(sentenceWords)            # record all words in sentence
            sentenceLenDist.append(len(sentenceWords))  # record length of sentence in words
            for word in sentenceWords:
                totalSyllables += count(word)
                wordLenDist.append(len(word))           # record length of word in chars
                if word in pronSet:
                    pronCount += 1                      # record no. of pronouns in sentence
                if word in conjSet:
                    conjCount += 1                      # record no. of conjunctions in sentence
                if word not in smartStopWords:
                    cleanWords.append(word)
            pronDist.append(pronCount)
            conjDist.append(conjCount)

    sentenceLengthFreqDist = FreqDist(sentenceLenDist)
    # Python 2: map() returns a list here, so append() is valid
    sentenceLengthDist = map(lambda x: sentenceLengthFreqDist.freq(x), range(1, RANGE))
    sentenceLengthDist.append(1 - sum(sentenceLengthDist))

    pronounFreqDist = FreqDist(pronDist)
    pronounDist = map(lambda x: pronounFreqDist.freq(x), range(1, RANGE))
    pronounDist.append(1 - sum(pronounDist))

    conjunctionFreqDist = FreqDist(conjDist)
    conjunctionDist = map(lambda x: conjunctionFreqDist.freq(x), range(1, RANGE))
    conjunctionDist.append(1 - sum(conjunctionDist))

    wordLengthFreqDist = FreqDist(wordLenDist)
    wordLengthDist = map(lambda x: wordLengthFreqDist.freq(x), range(1, RANGE))
    wordLengthDist.append(1 - sum(wordLengthDist))

    # calculate readability
    avgSentenceLength = np.mean(sentenceLenDist)
    avgSyllablesPerWord = float(totalSyllables) / totalWords
    readability = float(206.835 - (1.015 * avgSentenceLength) - (84.6 * avgSyllablesPerWord)) / 100

    wordsFreqDist = MyFreqDist(FreqDist(cleanWords))
    #sentenceDist = FreqDist(sentences)
    #print sentenceDist.keys()[:15]    # most common sentences
    #print wordsFreqDist.keys()[:15]   # most common words
    #print wordsFreqDist.keys()[-15:]  # most UNcommon words

    numUniqueWords = len(wordsFreqDist.keys())
    numTotalWords = len(cleanWords)

    hapax = float(len(wordsFreqDist.hapaxes())) / numUniqueWords  # no. words occurring once / total num. UNIQUE words
    dis = float(len(wordsFreqDist.dises())) / numUniqueWords      # no. words occurring twice / total num. UNIQUE words
    richness = float(numUniqueWords) / numTotalWords              # no. unique words / total num. words

    result = []
    result.append(hapax)
    result.append(dis)
    result.append(richness)
    result.append(readability)
    result.extend(sentenceLengthDist)
    result.extend(wordLengthDist)
    result.extend(pronounDist)
    result.extend(conjunctionDist)
    return result, numTotalWords
def count_syllables(words):
    syllableCount = 0
    for word in words:
        syllableCount += syllables_en.count(word)
    return syllableCount
def load_book_features(filename, smartStopWords={}, pronSet={}, conjSet={}):
    RANGE = 25
    text = extract_book_contents(open(filename, 'r').read()).lower()
    contents = re.sub('\'s|(\r\n)|-+|["_]', ' ', text)  # remove \r\n, apostrophes, and dashes
    sentenceList = sent_tokenize(contents.strip())

    cleanWords = []
    sentenceLenDist = []
    pronDist = []
    conjDist = []
    sentences = []
    totalWords = 0
    wordLenDist = []
    totalSyllables = 0
    for sentence in sentenceList:
        if sentence != ".":
            pronCount = 0
            conjCount = 0
            sentences.append(sentence)
            sentenceWords = re.findall(r"[\w']+", sentence)
            totalWords += len(sentenceWords)
            sentenceLenDist.append(len(sentenceWords))
            for word in sentenceWords:
                totalSyllables += count(word)
                wordLenDist.append(len(word))
                if word in pronSet:
                    pronCount += 1
                if word in conjSet:
                    conjCount += 1
                if word not in smartStopWords:
                    cleanWords.append(word)
            pronDist.append(pronCount)
            conjDist.append(conjCount)

    sentenceLengthFreqDist = FreqDist(sentenceLenDist)
    sentenceLengthDist = list(map(lambda x: sentenceLengthFreqDist.freq(x), range(1, RANGE)))
    sentenceLengthDist.append(1 - sum(sentenceLengthDist))

    pronounFreqDist = FreqDist(pronDist)
    pronounDist = list(map(lambda x: pronounFreqDist.freq(x), range(1, RANGE)))
    pronounDist.append(1 - sum(pronounDist))

    conjunctionFreqDist = FreqDist(conjDist)
    conjunctionDist = list(map(lambda x: conjunctionFreqDist.freq(x), range(1, RANGE)))
    conjunctionDist.append(1 - sum(conjunctionDist))

    wordLengthFreqDist = FreqDist(wordLenDist)
    wordLengthDist = list(map(lambda x: wordLengthFreqDist.freq(x), range(1, RANGE)))
    wordLengthDist.append(1 - sum(wordLengthDist))

    avgSentenceLength = np.mean(sentenceLenDist)
    avgSyllablesPerWord = float(totalSyllables) / totalWords
    readability = float(206.835 - (1.015 * avgSentenceLength) - (84.6 * avgSyllablesPerWord)) / 100

    wordsFreqDist = MyFreqDist(FreqDist(cleanWords))
    numUniqueWords = len(wordsFreqDist.keys())
    numTotalWords = len(cleanWords)
    hapax = float(len(wordsFreqDist.hapaxes())) / numUniqueWords
    dis = float(len(wordsFreqDist.dises())) / numUniqueWords
    richness = float(numUniqueWords) / numTotalWords

    result = []
    result.append(hapax)
    result.append(dis)
    result.append(richness)
    result.append(readability)
    result.extend(sentenceLengthDist)
    result.extend(wordLengthDist)
    result.extend(pronounDist)
    result.extend(conjunctionDist)
    return result, numTotalWords
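A minimal usage sketch for the Python 3 variant above, assuming the helpers it references (extract_book_contents, count, MyFreqDist) and the usual imports (re, numpy as np, and nltk's sent_tokenize and FreqDist) are in scope; the filename and word sets here are hypothetical:

pronSet = {'i', 'he', 'she', 'we', 'they'}          # nominative pronouns (sample)
conjSet = {'and', 'but', 'or', 'because', 'while'}  # conjunctions (sample)
features, numTotalWords = load_book_features('book.txt', pronSet=pronSet, conjSet=conjSet)
print(len(features))  # 4 + 4*RANGE = 104 feature values with RANGE = 25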
def my(word):
    # Prefer the CMUdict pronunciation when available; otherwise fall back
    # to the pattern-based estimate.
    phones = pronouncing.phones_for_word(word)
    if phones:
        return pronouncing.syllable_count(phones[0])
    else:
        return syllables_en.count(word)
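pronouncing.phones_for_word and pronouncing.syllable_count are the real pronouncing APIs, so the snippet above gets dictionary-accurate counts for in-vocabulary words and only estimates for the rest; a quick hedged check (example words arbitrary):

import pronouncing
for w in ('program', 'queue', 'syllable'):
    print(w, my(w))  # CMUdict-backed counts: 2, 1, 3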
def syllables_count(word):
    return syllables_en.count(word)
def count_syllables(self, word):
    return syllables_en.count(word)
def CountSyllables(word, isName=True):
    # `isName` is accepted for interface compatibility but unused here
    return syllables_en.count(word)