def test_lru_caching(self):
    """Check that a repeated ``avg_sentence_length`` call reuses cached work."""
    sentence_count = textstat.sentence_count
    avg_sentence_length = textstat.avg_sentence_length

    # Start from empty caches so the counters asserted below are meaningful.
    sentence_count._cache.clear()
    avg_sentence_length._cache.clear()

    # The first call goes through `sentence_count`, which must register
    # exactly one cache miss.
    avg_sentence_length(self.long_test)
    self.assertEqual(sentence_count._cache.misses, 1)

    # The second, identical call must not reach `sentence_count` again, so
    # its lookup counter stays at one.
    avg_sentence_length(self.long_test)
    self.assertEqual(sentence_count._cache.lookups, 1)
def test_lru_caching():
    """Check that a repeated ``avg_sentence_length`` call reuses cached work."""
    sentence_count = textstat.sentence_count
    avg_sentence_length = textstat.avg_sentence_length

    # Start from empty caches so the counters asserted below are meaningful.
    sentence_count._cache.clear()
    avg_sentence_length._cache.clear()

    # The first call goes through `sentence_count`: exactly one cache miss.
    avg_sentence_length(long_test)
    assert sentence_count._cache.misses == 1

    # The second, identical call must not reach `sentence_count` again, so
    # its lookup counter stays at one.
    avg_sentence_length(long_test)
    assert sentence_count._cache.lookups == 1
def transform(self, X):
    """Add hand-crafted text features to ``X`` and return the result.

    ``X`` is wrapped in a ``pandas.DataFrame`` and must provide a
    ``Comment`` column containing strings. The following columns are added:

    * ``%OfUpperCaseLetters`` -- percentage (0-100, two decimals) of
      characters for which ``str.isupper`` is true; ``0.0`` for an empty
      comment.
    * ``NoOfURL`` -- number of URL-like matches found in the comment.
    * ``NoOfWords`` -- number of tokens returned by ``word_tokenize``.
    * ``AvgSentenceLength`` -- ``textstat.avg_sentence_length`` of the
      comment.
    * ``TextStandard`` -- ``textstat.text_standard`` of the comment with
      ``float_output=True``.

    The resulting frame is also stored on ``self.X``.
    """
    X = pd.DataFrame(X)

    def upper_case_percentage(comment):
        """Return the share of upper-case characters in ``comment`` as a percentage."""
        if not comment:
            # An empty comment has no characters; avoid dividing by zero.
            return 0.0
        upper_count = sum(1 for char in comment if char.isupper())
        return round(upper_count * 100 / len(comment), 2)

    # Raw strings keep the regex escapes (`\/`, `\.`, `\s`) from being
    # parsed as invalid string escapes. The second alternative previously
    # read `[-zA-Z0-9]`, a typo for `[a-zA-Z0-9]` that made hosts such as
    # "www.my-site.com" go uncounted.
    url_pattern = re.compile(
        r'https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
        r'|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
        r'|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}'
        r'|www\.[a-zA-Z0-9]+\.[^\s]{2,}'
    )

    comments = X['Comment']
    X['%OfUpperCaseLetters'] = comments.apply(upper_case_percentage)
    X['NoOfURL'] = comments.apply(
        lambda text: len(url_pattern.findall(text)))
    X['NoOfWords'] = comments.apply(lambda text: len(word_tokenize(text)))
    X['AvgSentenceLength'] = comments.apply(textstat.avg_sentence_length)
    X['TextStandard'] = comments.apply(
        lambda text: textstat.text_standard(text, float_output=True))

    self.X = X
    return self.X
def statistics(self, text):
    """Compute several ``textstat`` metrics for ``text`` and store them on ``self``.

    Each metric is saved under its own attribute (``asl``,
    ``avg_sentence_per_word``, ``avg_syllables_per_word``,
    ``difficult_words``, ``lexicon_count``, ``polysyllable_count`` and
    ``sentence_count``). Nothing is returned.
    """
    # (attribute set on self, name of the textstat function), in the order
    # in which the metrics are evaluated.
    metric_sources = (
        ("asl", "avg_sentence_length"),
        ("avg_sentence_per_word", "avg_sentence_per_word"),
        ("avg_syllables_per_word", "avg_syllables_per_word"),
        ("difficult_words", "difficult_words"),
        ("lexicon_count", "lexicon_count"),
        ("polysyllable_count", "polysyllabcount"),
        ("sentence_count", "sentence_count"),
    )
    for attribute, function_name in metric_sources:
        setattr(self, attribute, getattr(textstat, function_name)(text))
def test_lru_caching():
    """Check that ``sentence_count`` serves a repeated request from its cache."""
    textstat.set_lang("en_US")

    sentence_count = textstat.sentence_count
    avg_sentence_length = textstat.avg_sentence_length

    # Start from empty caches so the counters asserted below are meaningful.
    sentence_count.cache_clear()
    avg_sentence_length.cache_clear()

    # The first call goes through `sentence_count`: exactly one cache miss.
    avg_sentence_length(long_test)
    first_info = sentence_count.cache_info()
    assert first_info.misses == 1

    # Drop only the outer cache so the second call has to ask
    # `sentence_count` again; that request must be answered from its cache.
    avg_sentence_length.cache_clear()
    avg_sentence_length(long_test)
    second_info = sentence_count.cache_info()
    assert second_info.hits == 1
def compute_readability_stats(text):
    """Compute reading statistics of the given text.

    Reference: https://github.com/shivam5992/textstat

    Parameters
    ----------
    text: str, input section or abstract text

    Returns
    -------
    dict
        Maps each statistic name (``flesch_reading_ease``, ``smog``, ...,
        ``avg_sentence_length``) to the value returned by the matching
        ``textstat`` function. If computing any statistic raises an
        exception, every value is ``None`` instead (all-or-nothing).
    """
    # Output key -> name of the textstat function that produces it. Keeping
    # a single table guarantees the success and fallback dictionaries always
    # expose the same keys in the same order.
    metric_functions = {
        'flesch_reading_ease': 'flesch_reading_ease',
        'smog': 'smog_index',
        'flesch_kincaid_grade': 'flesch_kincaid_grade',
        'coleman_liau_index': 'coleman_liau_index',
        'automated_readability_index': 'automated_readability_index',
        'dale_chall': 'dale_chall_readability_score',
        'difficult_words': 'difficult_words',
        'linsear_write': 'linsear_write_formula',
        'gunning_fog': 'gunning_fog',
        'text_standard': 'text_standard',
        'n_syllable': 'syllable_count',
        'avg_letter_per_word': 'avg_letter_per_word',
        'avg_sentence_length': 'avg_sentence_length',
    }
    try:
        return {
            key: getattr(textstat, function_name)(text)
            for key, function_name in metric_functions.items()
        }
    except Exception:
        # Deliberate best-effort: any failure yields an all-None result.
        # `Exception` (rather than a bare `except`) lets KeyboardInterrupt
        # and SystemExit propagate.
        return dict.fromkeys(metric_functions)
def test_avg_sentence_length(self):
    """The average sentence length of the long sample text must be 23.3."""
    expected = 23.3
    actual = textstat.avg_sentence_length(self.long_test)
    self.assertEqual(expected, actual)
def test_avg_sentence_length():
    """With the language set to en_US, the long sample text averages 23.3."""
    textstat.set_lang("en_US")
    assert textstat.avg_sentence_length(long_test) == 23.3
def get_redability_assessments(data_text: str) -> Optional[dict]:
    """Build a readability report for ``data_text``.

    The text is sentence- and word-tokenized, POS-tagged, and passed through
    a series of ``textstat`` metrics and project helpers. The result is a
    dict with these keys:

    * ``readabilityGradeLevels`` -- grade-level style metrics.
    * ``readabilityScores`` -- overall rating and score-style metrics.
    * ``textStatistics`` -- character, syllable, word, sentence and
      paragraph counts.
    * ``timings`` -- reading and speaking time derived from the word count.
    * ``textComposition`` -- counts per part-of-speech group.
    * ``textReadabilityIssues`` -- counts of long sentences and long words.
    * ``textStyleIssues`` -- counts of passive-voice and adverb findings.
    * ``textDensityIssues`` -- per-word, per-sentence, per-paragraph ratios.
    * ``textLanguageIssues`` -- placeholder counters, always zero here.
    * ``matches`` -- concatenation of all findings returned by
      ``find_limit_offcet``.

    NOTE(review): the annotation says ``Optional[dict]`` but no code path in
    this function returns ``None``.
    """
    divided_text = tokenize.sent_tokenize(data_text)
    word_tokenizes = nltk.word_tokenize(data_text)
    pos_tags = nltk.pos_tag(word_tokenizes)
    # Second tagging pass with the module-level TAGGER; only used by
    # find_adverbs() further down.
    pos_tags_tagger = TAGGER.tag(word_tokenizes)
    f_dist = nltk.FreqDist(word_tokenizes)
    uniqueWordCount = compute_unique_word_count(f_dist.most_common())
    # str.split always returns at least one element, so paragraphCount >= 1
    # and the per-paragraph divisions below cannot divide by zero.
    paragraphCount = max(len(data_text.split('\n')), len(data_text.split('\r\n')))
    # Number of tokens per POS tag.
    counts = Counter(tag for word, tag in pos_tags)

    # Readability Grade Levels
    # Every section below follows the same pattern: create the dict with
    # default values, then overwrite the entries that are actually computed.
    readability_grade_levels = dict(fleschKincaid=0, gunningFog=0, colemanLiau=0, smog=0, ari=0, forecastGradeLevel=0,
                                    powersSumnerKearlGrade=0, rix=0, raygorReadability=0, fryReadability=0, flesch=0)
    readability_grade_levels.update(fleschKincaid=textstat.flesch_kincaid_grade(data_text))
    readability_grade_levels.update(gunningFog=textstat.gunning_fog(data_text))
    readability_grade_levels.update(colemanLiau=textstat.coleman_liau_index(data_text))
    readability_grade_levels.update(smog=textstat.smog_index(data_text))
    readability_grade_levels.update(ari=textstat.automated_readability_index(data_text))
    readability_grade_levels.update(rix=textstat.rix(data_text))
    # need to check
    # NOTE(review): the key written here is "forcastGradeLevel", while the
    # default above is "forecastGradeLevel". As written, the result contains
    # both keys and "forecastGradeLevel" stays 0 -- confirm which spelling
    # consumers expect before changing it.
    readability_grade_levels.update(forcastGradeLevel=round(20 - (textstat.avg_syllables_per_word(data_text) / 10), 2))
    readability_grade_levels.update(powersSumnerKearlGrade=round(textstat.avg_sentence_length(data_text) + textstat.avg_syllables_per_word(data_text) + 2.7971, 2))
    readability_grade_levels.update(raygorReadability=count_raygor_readability(divided_text))
    readability_grade_levels.update(fryReadability=count_fry_readability(divided_text))
    # need to check
    readability_grade_levels.update(flesch=textstat.flesch_reading_ease(data_text))

    # Readability Scores
    # readableRating, cefrLevel and ieltsLevel are derived from the complete
    # grade-level dict built above, so this section must come after it.
    readability_scores = dict(readableRating="", fleschReadingEase=0, cefrLevel='', ieltsLevel='', spacheScore=0,
                              newDaleChallScore=0, lixReadability=0, lensearWrite=0)
    readability_scores.update(readableRating=count_average_grade_levels(readability_grade_levels))
    readability_scores.update(fleschReadingEase=textstat.flesch_reading_ease(data_text))
    readability_scores.update(cefrLevel=count_cefr_levels(readability_grade_levels))
    readability_scores.update(ieltsLevel=count_ielts_levels(readability_grade_levels))
    readability_scores.update(spacheScore=round(textstat.spache_readability(data_text), 2))
    readability_scores.update(newDaleChallScore=textstat.dale_chall_readability_score_v2(data_text))
    readability_scores.update(lixReadability=textstat.lix(data_text))
    readability_scores.update(lensearWrite=textstat.linsear_write_formula(data_text))

    # Text Statistics
    text_statistics = dict(characterCount=0, syllableCount=0, wordCount=0, uniqueWordCount=0, sentenceCount=0,
                           paragraphCount=0)
    text_statistics.update(characterCount=textstat.char_count(data_text))
    text_statistics.update(syllableCount=textstat.syllable_count(data_text))
    text_statistics.update(wordCount=textstat.lexicon_count(data_text))
    text_statistics.update(uniqueWordCount=uniqueWordCount)
    text_statistics.update(sentenceCount=textstat.sentence_count(data_text))
    text_statistics.update(paragraphCount=paragraphCount)

    # Timings
    # Both timings are computed by project helpers from the word count.
    timings_statistics = dict(readingTime=0, speakingTime=0)
    timings_statistics.update(readingTime=reading_time(textstat.lexicon_count(data_text)))
    timings_statistics.update(speakingTime=speaking_time(textstat.lexicon_count(data_text)))

    # Text Composition
    # Sums of per-tag counts from nltk.pos_tag (the tag names look like Penn
    # Treebank tags -- confirm against the tagger in use).
    text_composition = dict(adjectives=0, adverbs=0, conjunctions=0, determiners=0, interjections=0, nouns=0, verbs=0,
                            properNouns=0, prepositions=0, pronouns=0, qualifiers=0, unrecognised=0, nonWords=0)
    text_composition.update(adjectives=counts.get('JJ', 0) + counts.get('JJR', 0) + counts.get('JJS', 0))
    text_composition.update(adverbs=counts.get('RB', 0) + counts.get('RBR', 0) + counts.get('RBS', 0))
    text_composition.update(conjunctions=counts.get('CC', 0))
    text_composition.update(determiners=counts.get('DT', 0) + counts.get('PDT', 0) + counts.get('WDT', 0))
    text_composition.update(interjections=counts.get('UH', 0))
    text_composition.update(nouns=counts.get('NN', 0) + counts.get('NNS', 0))
    text_composition.update(
        verbs=counts.get('VB', 0) + counts.get('VBD', 0) + counts.get('VBG', 0) + counts.get('VBN', 0) + counts.get(
            'VBP', 0) + counts.get('VBZ', 0))
    text_composition.update(properNouns=counts.get('NNP', 0) + counts.get('NNPS', 0))
    text_composition.update(prepositions=counts.get('IN', 0))
    text_composition.update(
        pronouns=counts.get('PRP', 0) + counts.get('PRP$', 0) + counts.get('WP', 0) + counts.get('WP$', 0))
    # NOTE(review): 'RB' is already part of the adverbs sum above, so
    # qualifiers repeats that count -- confirm this is intended.
    text_composition.update(qualifiers=counts.get('RB', 0))
    # Tokens whose tag is None.
    text_composition.update(unrecognised=counts.get(None, 0))
    text_composition.update(nonWords=counts.get('.', 0) + counts.get(',', 0) + counts.get(':', 0))

    # Readability Issues
    # Only the *Count entries are filled in below; the list-valued defaults
    # stay empty and the located findings are returned through `matches`.
    text_readability_issues = dict(sentences30SyllablesCount=0, sentences20SyllablesCount=0, sentences30Syllables=[],
                                   sentences20Syllables=[], words4SyllablesCount=0, words12LettersCount=0,
                                   words4Syllables=[], words12Letters=[])
    sentences_30_syllables, sentences_30_count, sentences_20_syllables, sentences_20_count = count_sentences_syllables(
        divided_text)
    # find_limit_offcet receives the full text, the findings, two
    # identifiers, a user-facing message and a category label; its return
    # value replaces the raw findings (presumably adding positions within
    # data_text -- confirm against the helper).
    sentences_30_syllables = find_limit_offcet(data_text, sentences_30_syllables, "sentences_30_syllables",
                                               "sentences_30_syllables",
                                               "This sentence has more than 30 syllables. Consider rewriting it to be shorter or splitting it into smaller sentences.",
                                               "Readability Issues")
    sentences_20_syllables = find_limit_offcet(data_text, sentences_20_syllables, "sentences_20_syllables",
                                               "sentences_20_syllables",
                                               "This sentence has more than 20 syllables. Consider rewriting it to be shorter or splitting it into smaller sentences.",
                                               "Readability Issues")
    text_readability_issues.update(sentences30SyllablesCount=sentences_30_count,
                                   sentences20SyllablesCount=sentences_20_count)
    words_12_letters, words_12_count, words_4_syllables, words_4_count = words_sentence_syllables(divided_text)
    words_12_letters = find_limit_offcet(data_text, words_12_letters, "words_12_letters", "words_12_letters",
                                         "This word is more than 12 letters", "Readability Issues")
    words_4_syllables = find_limit_offcet(data_text, words_4_syllables, "words_4_syllables", "words_4_syllables",
                                          "This word is more than 4 syllables", "Readability Issues")
    text_readability_issues.update(words4SyllablesCount=words_4_count, words12LettersCount=words_12_count)

    # Writing Style Issues
    # As above, only the counts are stored here; clicheCount and the list
    # entries keep their defaults.
    text_style_issues = dict(passiveVoiceCount=0, passiveVoices=[], adverbsCount=0, adverbs=[], clicheCount=0,
                             cliches=[])
    passive_voises_return = find_passives(divided_text)
    passive_voises_return = find_limit_offcet(data_text, passive_voises_return, "passive_voises", "passive_voises",
                                              "Too much of using passive voises", "Writing Style Issues")
    adverbs_return = find_adverbs(pos_tags_tagger)
    adverbs_return = find_limit_offcet(data_text, adverbs_return, "adverbs",  # writing_style_issues
                                       "adverbs", "Too much of using adverbs", "Writing Style Issues")
    # Counts are taken after find_limit_offcet, i.e. they reflect the number
    # of findings that helper returned.
    text_style_issues.update(passiveVoiceCount=len(passive_voises_return), adverbsCount=len(adverbs_return))

    # Text Density Issues
    # NOTE(review): wordsPerSentence divides by len(divided_text); if
    # sent_tokenize returns no sentences (e.g. for empty text -- confirm)
    # this raises ZeroDivisionError.
    text_density_issues = dict(charactersPerWord=0, syllablesPerWord=0, wordsPerSentence=0, wordsPerParagraph=0,
                               sentencesPerParagraph=0)
    text_density_issues.update(charactersPerWord=textstat.avg_character_per_word(data_text),
                               syllablesPerWord=textstat.avg_syllables_per_word(data_text),
                               wordsPerSentence=round(textstat.lexicon_count(data_text) / len(divided_text), 2),
                               wordsPerParagraph=round(textstat.lexicon_count(data_text) / paragraphCount, 2),
                               sentencesPerParagraph=round(len(divided_text) / paragraphCount, 2))

    # Language Issues
    # No spelling or grammar check is performed in this function; both
    # counters are returned as zero.
    text_language_issues = dict(spellingIssuesCount=0, grammarIssueCount=0)

    # All findings combined, in a fixed order.
    matches_limit_offcet = sentences_20_syllables + sentences_30_syllables + words_4_syllables + words_12_letters + \
        passive_voises_return + adverbs_return

    return dict(readabilityGradeLevels=readability_grade_levels, readabilityScores=readability_scores,
                textStatistics=text_statistics, timings=timings_statistics, textComposition=text_composition,
                textReadabilityIssues=text_readability_issues, textStyleIssues=text_style_issues,
                textDensityIssues=text_density_issues, textLanguageIssues=text_language_issues,
                matches=matches_limit_offcet)
def test_avg_sentence_length():
    """The average sentence length of the long sample text must be 23.3."""
    assert textstat.avg_sentence_length(long_test) == 23.3