Example #1
def FleschKincaidGradeLevel(text):
    # assumes module-level helpers: nltk (imported), TOKENIZER (a word
    # tokenizer), SPECIAL_CHARS, and the syllables_en counter
    sentences = []
    filtered_words = []
    syllableCount = 0
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    for line in text:
        line = line.lower()  # str.lower() returns a new string; keep the result
        words = TOKENIZER.tokenize(line)
        for word in words:
            if word not in SPECIAL_CHARS:
                new_word = word.strip().replace(",", "").replace(".", "")
                new_word = new_word.replace("!", "").replace("?", "")
                if new_word.strip():
                    filtered_words.append(new_word)
            # count syllables once per word, not once per character
            syllableCount += syllables_en.count(word)
        sentences.extend(tokenizer.tokenize(line))  # extend, so len() is the sentence count
    word_count = len(filtered_words)
    sentence_count = len(sentences)
    score = 0.0
    if word_count > 0 and sentence_count > 0:
        avg_words_p_sentence = float(word_count) / sentence_count
        score = (0.39 * avg_words_p_sentence
                 + 11.8 * (float(syllableCount) / word_count)
                 - 15.59)
    return round(score, 4)
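A minimal call sketch, assuming the punkt model is installed and the module-level TOKENIZER and SPECIAL_CHARS exist; the sample lines below are placeholders:

sample_lines = [
    "The cat sat on the mat.",
    "Readability formulas estimate how hard a text is to read.",
]
print(FleschKincaidGradeLevel(sample_lines))  # prints an approximate U.S. grade level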
Example #3
def count_syllables(words):
    for w in words:
        if w in exclude:
            continue
        s = syllables_en.count(w)
        if s > 7:  # probably a LaTeX artifact and not a word
            continue
        yield s
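Because this version of count_syllables is a generator, totals come from consuming it, e.g. with sum(); exclude and syllables_en are assumed from the surrounding module:

words = "the quick brown fox jumps".split()  # placeholder token list
total_syllables = sum(count_syllables(words))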
Example #5
def count_syllables(word):
    """Given a word as a string, return the number of syllables in that word,
    as best we can."""
    if word in PRONUNCIATIONS:
        try:
            return count_syllables_phonemes(PRONUNCIATIONS[word][0])
        except Exception:  # fall back to the heuristic counter on any failure
            pass
    return syllables_en.count(word)
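The helpers PRONUNCIATIONS and count_syllables_phonemes are not shown in these snippets. One plausible construction, assuming NLTK's CMU Pronouncing Dictionary is installed (illustrative, not the original definitions):

import nltk

PRONUNCIATIONS = nltk.corpus.cmudict.dict()  # word -> list of phoneme lists

def count_syllables_phonemes(phonemes):
    # CMUdict vowel phonemes end in a stress digit (e.g. 'AH0'),
    # so counting digit-terminated phonemes counts syllables
    return sum(1 for ph in phonemes if ph[-1].isdigit())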
Example #6
def count_syllables(words, lang):
    syllableCount = 0
    for word in words:
        if lang == 'pt':
            # decode bytes up front; in Python 3, str has no .decode()
            if isinstance(word, bytes):
                word = word.decode('utf8', 'ignore')
            syllableCount += syllables_pt.count(word)
        else:
            syllableCount += syllables_en.count(word)
    return syllableCount
Example #7
def count_syllables(word):
    """Given a word as a string, return the number of syllables in that word,
    as best we can."""

    # special case for Russian, or perhaps other languages that use Cyrillic
    if cyrillic(word):
        return count_syllables_cyrillic(word)

    if word in PRONUNCIATIONS:
        try:
            return count_syllables_phonemes(PRONUNCIATIONS[word][0])
        except Exception:  # fall back to the heuristic counter on any failure
            pass
    return syllables_en.count(word)
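The cyrillic predicate is likewise not shown; a simple sketch that flags any character in the basic Cyrillic block (an assumption, not the original helper):

def cyrillic(word):
    # True if any character falls in the basic Cyrillic block U+0400..U+04FF
    return any('\u0400' <= ch <= '\u04ff' for ch in word)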
Example #9
def syllable_stats(words):
    totsyl = 0
    polysylcount = 0
    complexwords = 0
    for w in words:
        if w in exclude:
            continue
        s = syllables_en.count(w)

        if s > 7:  # probably a LaTeX artifact and not a word
            continue
        totsyl += s
        if s >= 3:
            polysylcount += s
            complex_s = s
            # complex words are not nouns, have >= 3 syl, not counting common endings
            # (and are not compound words, not checked here)
            if any(w.endswith(ending) for ending in ('es', 'ed', 'ing')):
                complex_s = s - 1
            if complex_s >= 3 and w[0].islower() and w not in easy_word_set:
                complexwords += 1
    return totsyl, polysylcount, complexwords
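The three counters returned here are the raw ingredients of standard readability formulas; for instance, the complex-word count is what the Gunning fog index needs. A minimal sketch of that step, with illustrative parameter names (word and sentence counts come from the caller):

def fog_index(num_words, num_sentences, num_complex_words):
    # Gunning fog: 0.4 * (average sentence length + percent complex words)
    if num_words == 0 or num_sentences == 0:
        return 0.0
    avg_sentence_len = num_words / num_sentences
    pct_complex = 100.0 * num_complex_words / num_words
    return 0.4 * (avg_sentence_len + pct_complex)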
Example #11
def load_book_features(filename, smartStopWords={}, pronSet={}, conjSet={}):
    '''
    Load features for each book in the corpus. There are 4 + RANGE*4 features
    for each instance. These features are:
       ---------------------------------------------------------------------------------------------------------
       No. Feature Name                                                                         No. of features.
       ---------------------------------------------------------------------------------------------------------
       1.  number of hapax legomena divided by number of unique words                           1
       2.  number of dis legomena divided by number of unique words                             1
       3.  number of unique words divided by number of total words                              1
       4.  flesch readability score divided by 100                                              1

       5.  no. of sentences of length in the range [1, RANGE] divided by the                    RANGE
           number of total sentences
       6.  no. of words of length in the range [1, RANGE] divided by the                        RANGE
           number of total words
       7.  no. of nominative pronouns per sentence in the range [1, RANGE] divided by the       RANGE
           number of total sentences
       8.  no. of (coordinating + subordinating) conjunctions per sentence in the range         RANGE
           [1, RANGE] divided by the number of total sentences
    '''

    with open(filename, 'r') as f:
        text = extract_book_contents(f.read()).lower()

    contents = re.sub('\'s|(\r\n)|-+|["_]', ' ',
                      text)  # remove \r\n, apostrophes, and dashes
    sentenceList = sent_tokenize(contents.strip())

    cleanWords = []
    sentenceLenDist = []
    pronDist = []
    conjDist = []
    sentences = []
    totalWords = 0
    wordLenDist = []
    totalSyllables = 0
    for sentence in sentenceList:
        if sentence != ".":
            pronCount = 0
            conjCount = 0
            sentences.append(sentence)
            sentenceWords = re.findall(r"[\w']+", sentence)
            totalWords += len(sentenceWords)  # record all words in sentence
            sentenceLenDist.append(
                len(sentenceWords))  # record length of sentence in words
            for word in sentenceWords:
                totalSyllables += syllables_en.count(word)  # bare count() was undefined here
                wordLenDist.append(len(word))  # record length of word in chars
                if word in pronSet:
                    pronCount += 1  # record no. of pronouns in sentence
                if word in conjSet:
                    conjCount += 1  # record no. of conjunctions in sentence
                if word not in smartStopWords:
                    cleanWords.append(word)
            pronDist.append(pronCount)
            conjDist.append(conjCount)

    sentenceLengthFreqDist = FreqDist(sentenceLenDist)
    # materialize as lists: in Python 3, map() returns an iterator without .append()
    sentenceLengthDist = [sentenceLengthFreqDist.freq(x) for x in range(1, RANGE)]
    sentenceLengthDist.append(1 - sum(sentenceLengthDist))

    pronounFreqDist = FreqDist(pronDist)
    pronounDist = [pronounFreqDist.freq(x) for x in range(1, RANGE)]
    pronounDist.append(1 - sum(pronounDist))

    conjunctionFreqDist = FreqDist(conjDist)
    conjunctionDist = [conjunctionFreqDist.freq(x) for x in range(1, RANGE)]
    conjunctionDist.append(1 - sum(conjunctionDist))

    wordLengthFreqDist = FreqDist(wordLenDist)
    wordLengthDist = [wordLengthFreqDist.freq(x) for x in range(1, RANGE)]
    wordLengthDist.append(1 - sum(wordLengthDist))

    # calculate readability
    avgSentenceLength = np.mean(sentenceLenDist)
    avgSyllablesPerWord = float(totalSyllables) / totalWords
    readability = float(206.835 - (1.015 * avgSentenceLength) -
                        (84.6 * avgSyllablesPerWord)) / 100

    wordsFreqDist = MyFreqDist(FreqDist(cleanWords))
    #sentenceDist = FreqDist(sentences)
    #print sentenceDist.keys()[:15] # most common sentences
    #print wordsFreqDist.keys()[:15] # most common words
    #print wordsFreqDist.keys()[-15:] # most UNcommon words

    numUniqueWords = len(wordsFreqDist.keys())
    numTotalWords = len(cleanWords)

    hapax = float(len(wordsFreqDist.hapaxes(
    ))) / numUniqueWords  # no. words occurring once / total num. UNIQUE words
    dis = float(len(wordsFreqDist.dises(
    ))) / numUniqueWords  # no. words occurring twice / total num. UNIQUE words
    richness = float(
        numUniqueWords) / numTotalWords  # no. unique words / total num. words

    result = []
    result.append(hapax)
    result.append(dis)
    result.append(richness)
    result.append(readability)
    result.extend(sentenceLengthDist)
    result.extend(wordLengthDist)
    result.extend(pronounDist)
    result.extend(conjunctionDist)

    return result, numTotalWords
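A minimal usage sketch, assuming the surrounding module defines extract_book_contents, MyFreqDist, and RANGE; the path and word sets below are placeholders:

pron_set = {'i', 'he', 'she', 'we', 'they'}   # illustrative nominative pronouns
conj_set = {'and', 'but', 'or', 'because'}    # illustrative conjunctions
stop_set = {'the', 'a', 'an', 'of'}           # illustrative stop words

features, total_words = load_book_features(
    'corpus/some_book.txt',                   # placeholder path
    smartStopWords=stop_set, pronSet=pron_set, conjSet=conj_set)
assert len(features) == 4 + 4 * RANGE         # per the docstring above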
Example #13
def count_syllables(words):
    syllableCount = 0
    for word in words:
        syllableCount += syllables_en.count(word)
    return syllableCount
Example #14
def load_book_features(filename, smartStopWords={}, pronSet={}, conjSet={}):
    RANGE = 25
    with open(filename, 'r') as f:
        text = extract_book_contents(f.read()).lower()
    contents = re.sub('\'s|(\r\n)|-+|["_]', ' ',
                      text)  # remove \r\n, apostrophes, and dashes
    sentenceList = sent_tokenize(contents.strip())
    cleanWords = []
    sentenceLenDist = []
    pronDist = []
    conjDist = []
    sentences = []
    totalWords = 0
    wordLenDist = []
    totalSyllables = 0
    for sentence in sentenceList:
        if sentence != ".":
            pronCount = 0
            conjCount = 0
            sentences.append(sentence)
            sentenceWords = re.findall(r"[\w']+", sentence)
            totalWords += len(sentenceWords)
            sentenceLenDist.append(len(sentenceWords))
            for word in sentenceWords:
                totalSyllables += syllables_en.count(word)  # bare count() was undefined here
                wordLenDist.append(len(word))
                if word in pronSet:
                    pronCount += 1
                if word in conjSet:
                    conjCount += 1
                if word not in smartStopWords:
                    cleanWords.append(word)
            pronDist.append(pronCount)
            conjDist.append(conjCount)

    sentenceLengthFreqDist = FreqDist(sentenceLenDist)
    sentenceLengthDist = list(
        map(lambda x: sentenceLengthFreqDist.freq(x), range(1, RANGE)))
    sentenceLengthDist.append(1 - sum(sentenceLengthDist))

    pronounFreqDist = FreqDist(pronDist)
    pronounDist = list(map(lambda x: pronounFreqDist.freq(x), range(1, RANGE)))
    pronounDist.append(1 - sum(pronounDist))

    conjunctionFreqDist = FreqDist(conjDist)
    conjunctionDist = list(
        map(lambda x: conjunctionFreqDist.freq(x), range(1, RANGE)))
    conjunctionDist.append(1 - sum(conjunctionDist))

    wordLengthFreqDist = FreqDist(wordLenDist)
    wordLengthDist = list(
        map(lambda x: wordLengthFreqDist.freq(x), range(1, RANGE)))
    wordLengthDist.append(1 - sum(wordLengthDist))

    avgSentenceLength = np.mean(sentenceLenDist)
    avgSyllablesPerWord = float(totalSyllables) / totalWords
    readability = float(206.835 - (1.015 * avgSentenceLength) -
                        (84.6 * avgSyllablesPerWord)) / 100

    wordsFreqDist = MyFreqDist(FreqDist(cleanWords))

    numUniqueWords = len(wordsFreqDist.keys())
    numTotalWords = len(cleanWords)

    hapax = float(len(wordsFreqDist.hapaxes())) / numUniqueWords
    dis = float(len(wordsFreqDist.dises())) / numUniqueWords
    richness = float(numUniqueWords) / numTotalWords

    result = []
    result.append(hapax)
    result.append(dis)
    result.append(richness)
    result.append(readability)
    result.extend(sentenceLengthDist)
    result.extend(wordLengthDist)
    result.extend(pronounDist)
    result.extend(conjunctionDist)
    return result, numTotalWords
Example #15
def my(word):
    phones = pronouncing.phones_for_word(word)
    if phones:
        return pronouncing.syllable_count(phones[0])
    else:
        return syllables_en.count(word)
Example #16
def syllables_count(word):
    return syllables_en.count(word)
Example #18
def count_syllables(self, word):
    return syllables_en.count(word)
Example #19
def CountSyllables(word, isName=True):
    # isName is accepted but unused in this snippet
    return syllables_en.count(word)