def test_char_count():
    """Check ``textstat.char_count`` on the shared ``long_test`` sample.

    The language is pinned to ``en_US`` first. The sample is then counted
    twice: once with default arguments and once with ``ignore_spaces=False``.
    """
    textstat.set_lang("en_US")

    default_total = textstat.char_count(long_test)
    total_with_spaces = textstat.char_count(long_test, ignore_spaces=False)

    assert default_total == 1750
    assert total_with_spaces == 2123
def test_char_count():
    """Verify both ``char_count`` modes against the ``long_test`` fixture."""
    # Both measurements are taken before anything is asserted.
    measured = textstat.char_count(long_test)
    measured_keep_spaces = textstat.char_count(long_test, ignore_spaces=False)

    assert measured == 1750
    assert measured_keep_spaces == 2123
def __init__(self, text):
    """Preprocess ``text`` and cache the statistics derived from it.

    Args:
        text: Raw input; it is passed to ``self.preprocess`` and the result
            is stored as the instance's document.
    """
    # Preprocessed document; the textstat calls below read it directly.
    self.__doc = self.preprocess(text)
    # NOTE(review): the self.get*() helpers presumably read the attributes
    # assigned above them -- confirm before reordering any of these lines.
    self.__docWords = self.getTotalWords()
    # Word total from textstat, called with removepunct=True.
    self.__totalWords = textstat.lexicon_count(self.__doc, removepunct=True)
    # Character total from textstat, called with ignore_spaces=True.
    self.__totalCharacters = textstat.char_count(self.__doc, ignore_spaces=True)
    self.__totalSentences = self.getSentencesCount()
    self.__totalSyllables = self.getSyllablesCount()
    # Disabled alternative that took the syllable total straight from textstat:
    # self.__totalSyllables = textstat.syllable_count(self.__doc)
    self.__polySyllableCount = self.getPolySyllableCount()
def test_char_count(self):
    """Check ``textstat.char_count`` on ``self.long_test`` in both modes."""
    sample = self.long_test

    default_total = textstat.char_count(sample)
    total_with_spaces = textstat.char_count(sample, ignore_spaces=False)

    # Expected value goes first, as in the rest of this assertion style.
    self.assertEqual(1750, default_total)
    self.assertEqual(2123, total_with_spaces)
def get_redability_assessments(data_text: str) -> Optional[dict]:
    """Build the readability report for ``data_text``.

    The text is sentence- and word-tokenized and POS-tagged once. Every
    section of the report is then derived from those results, from
    ``textstat``, and from this module's helper functions.

    Args:
        data_text: The text to analyse.

    Returns:
        A dict with the keys ``readabilityGradeLevels``,
        ``readabilityScores``, ``textStatistics``, ``timings``,
        ``textComposition``, ``textReadabilityIssues``, ``textStyleIssues``,
        ``textDensityIssues``, ``textLanguageIssues`` and ``matches``.
        ``matches`` is the concatenation of every issue list returned by
        ``find_limit_offcet``.
    """
    divided_text = tokenize.sent_tokenize(data_text)
    word_tokenizes = nltk.word_tokenize(data_text)
    pos_tags = nltk.pos_tag(word_tokenizes)
    pos_tags_tagger = TAGGER.tag(word_tokenizes)
    f_dist = nltk.FreqDist(word_tokenizes)
    unique_word_count = compute_unique_word_count(f_dist.most_common())
    paragraph_count = max(len(data_text.split('\n')), len(data_text.split('\r\n')))
    counts = Counter(tag for _, tag in pos_tags)

    def tag_total(*tags):
        """Sum the occurrences of the given POS tags."""
        return sum(counts.get(tag, 0) for tag in tags)

    # Values reused by several sections; computed once.
    word_count = textstat.lexicon_count(data_text)
    syllables_per_word = textstat.avg_syllables_per_word(data_text)
    sentence_total = len(divided_text)

    # Readability Grade Levels
    # NOTE(review): the FORCAST formula below carried a "need to check" marker.
    forecast_grade = round(20 - (syllables_per_word / 10), 2)
    readability_grade_levels = {
        "fleschKincaid": textstat.flesch_kincaid_grade(data_text),
        "gunningFog": textstat.gunning_fog(data_text),
        "colemanLiau": textstat.coleman_liau_index(data_text),
        "smog": textstat.smog_index(data_text),
        "ari": textstat.automated_readability_index(data_text),
        # Previously this key never received its value: the result was
        # stored only under the misspelled "forcastGradeLevel" key.
        "forecastGradeLevel": forecast_grade,
        "powersSumnerKearlGrade": round(
            textstat.avg_sentence_length(data_text) + syllables_per_word + 2.7971, 2),
        "rix": textstat.rix(data_text),
        "raygorReadability": count_raygor_readability(divided_text),
        # NOTE(review): the Fry value also carried a "need to check" marker.
        "fryReadability": count_fry_readability(divided_text),
        "flesch": textstat.flesch_reading_ease(data_text),
        # Misspelled alias kept so existing consumers of the report keep
        # working; drop it once nothing reads it any more.
        "forcastGradeLevel": forecast_grade,
    }

    # Readability Scores
    readability_scores = {
        "readableRating": count_average_grade_levels(readability_grade_levels),
        "fleschReadingEase": textstat.flesch_reading_ease(data_text),
        "cefrLevel": count_cefr_levels(readability_grade_levels),
        "ieltsLevel": count_ielts_levels(readability_grade_levels),
        "spacheScore": round(textstat.spache_readability(data_text), 2),
        "newDaleChallScore": textstat.dale_chall_readability_score_v2(data_text),
        "lixReadability": textstat.lix(data_text),
        "lensearWrite": textstat.linsear_write_formula(data_text),
    }

    # Text Statistics
    text_statistics = {
        "characterCount": textstat.char_count(data_text),
        "syllableCount": textstat.syllable_count(data_text),
        "wordCount": word_count,
        "uniqueWordCount": unique_word_count,
        "sentenceCount": textstat.sentence_count(data_text),
        "paragraphCount": paragraph_count,
    }

    # Timings
    timings_statistics = {
        "readingTime": reading_time(word_count),
        "speakingTime": speaking_time(word_count),
    }

    # Text Composition (Penn Treebank style tags grouped per category)
    text_composition = {
        "adjectives": tag_total('JJ', 'JJR', 'JJS'),
        "adverbs": tag_total('RB', 'RBR', 'RBS'),
        "conjunctions": tag_total('CC'),
        "determiners": tag_total('DT', 'PDT', 'WDT'),
        "interjections": tag_total('UH'),
        "nouns": tag_total('NN', 'NNS'),
        "verbs": tag_total('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'),
        "properNouns": tag_total('NNP', 'NNPS'),
        "prepositions": tag_total('IN'),
        "pronouns": tag_total('PRP', 'PRP$', 'WP', 'WP$'),
        "qualifiers": tag_total('RB'),
        "unrecognised": tag_total(None),
        "nonWords": tag_total('.', ',', ':'),
    }

    # Readability Issues
    (sentences_30_syllables, sentences_30_count,
     sentences_20_syllables, sentences_20_count) = count_sentences_syllables(divided_text)
    sentences_30_syllables = find_limit_offcet(
        data_text, sentences_30_syllables,
        "sentences_30_syllables", "sentences_30_syllables",
        "This sentence has more than 30 syllables. Consider rewriting it to be shorter or splitting it into smaller sentences.",
        "Readability Issues")
    sentences_20_syllables = find_limit_offcet(
        data_text, sentences_20_syllables,
        "sentences_20_syllables", "sentences_20_syllables",
        "This sentence has more than 20 syllables. Consider rewriting it to be shorter or splitting it into smaller sentences.",
        "Readability Issues")

    words_12_letters, words_12_count, words_4_syllables, words_4_count = \
        words_sentence_syllables(divided_text)
    words_12_letters = find_limit_offcet(
        data_text, words_12_letters,
        "words_12_letters", "words_12_letters",
        "This word is more than 12 letters", "Readability Issues")
    words_4_syllables = find_limit_offcet(
        data_text, words_4_syllables,
        "words_4_syllables", "words_4_syllables",
        "This word is more than 4 syllables", "Readability Issues")

    # Only the counts are filled in here; the list entries stay empty and
    # the located items are reported through "matches" instead.
    text_readability_issues = {
        "sentences30SyllablesCount": sentences_30_count,
        "sentences20SyllablesCount": sentences_20_count,
        "sentences30Syllables": [],
        "sentences20Syllables": [],
        "words4SyllablesCount": words_4_count,
        "words12LettersCount": words_12_count,
        "words4Syllables": [],
        "words12Letters": [],
    }

    # Writing Style Issues
    passive_voises_return = find_passives(divided_text)
    passive_voises_return = find_limit_offcet(
        data_text, passive_voises_return,
        "passive_voises", "passive_voises",
        "Too much of using passive voises", "Writing Style Issues")
    adverbs_return = find_adverbs(pos_tags_tagger)
    adverbs_return = find_limit_offcet(
        data_text, adverbs_return,
        "adverbs", "adverbs",
        "Too much of using adverbs", "Writing Style Issues")
    text_style_issues = {
        "passiveVoiceCount": len(passive_voises_return),
        "passiveVoices": [],
        "adverbsCount": len(adverbs_return),
        "adverbs": [],
        "clicheCount": 0,
        "cliches": [],
    }

    # Text Density Issues
    text_density_issues = {
        "charactersPerWord": textstat.avg_character_per_word(data_text),
        "syllablesPerWord": syllables_per_word,
        # The tokenizer can return no sentences (e.g. empty text); report 0.0
        # instead of dividing by zero.
        "wordsPerSentence": round(word_count / sentence_total, 2) if sentence_total else 0.0,
        # paragraph_count is at least 1 because str.split never returns an
        # empty list when given a separator.
        "wordsPerParagraph": round(word_count / paragraph_count, 2),
        "sentencesPerParagraph": round(sentence_total / paragraph_count, 2),
    }

    # Language Issues (placeholders; nothing in this function fills them)
    text_language_issues = {"spellingIssuesCount": 0, "grammarIssueCount": 0}

    matches_limit_offcet = (
        sentences_20_syllables + sentences_30_syllables
        + words_4_syllables + words_12_letters
        + passive_voises_return + adverbs_return
    )

    return {
        "readabilityGradeLevels": readability_grade_levels,
        "readabilityScores": readability_scores,
        "textStatistics": text_statistics,
        "timings": timings_statistics,
        "textComposition": text_composition,
        "textReadabilityIssues": text_readability_issues,
        "textStyleIssues": text_style_issues,
        "textDensityIssues": text_density_issues,
        "textLanguageIssues": text_language_issues,
        "matches": matches_limit_offcet,
    }
# Read the tab-separated input and keep only the tweet text column.
data = pd.read_csv(
    "INPUT_2.txt",
    sep='\t',
    header=None,
    names=['id', 'target', 'tweet'],
    encoding='utf-8',
)
data.drop(['id', 'target'], axis=1, inplace=True)

# Total number of tweets
print("Total number of tweets is ", len(data))

# Total number of characters, summed over every tweet
char = sum(textstat.char_count(tweet) for tweet in data['tweet'])
print("Total number of characters is ", char)

# Total number of distinct words
twt = TweetTokenizer(strip_handles=True, reduce_len=True)
words = [word for tweet in data['tweet'] for word in twt.tokenize(tweet)]
# Keep the module-level counter at the row count; code further down the
# script may still read it.
i = len(data)
print("Total number of distinct words is ", len(set(words)))

# Avg number of characters in each tweet
def test_char_count():
    """Pin the two ``char_count`` totals expected for ``long_test``."""
    # One call with default arguments, one with ignore_spaces=False; both
    # run before either result is checked.
    totals = (
        textstat.char_count(long_test),
        textstat.char_count(long_test, ignore_spaces=False),
    )
    default_total, total_with_spaces = totals

    assert default_total == 1750
    assert total_with_spaces == 2123
characters_based_features['number_of_close_paranthesis'] += 1 if k == '()': characters_based_features['number_of_pair_paranthesis'] += 1 if k == '[': characters_based_features['number_of_open_squareBracket'] += 1 if k == ']': characters_based_features['number_of_close_squareBracket'] += 1 if k == '[]': characters_based_features['number_of_pair_squareBracket'] += 1 if k == '{': characters_based_features['number_of_open_curlyBracket'] += 1 if k == '}': characters_based_features['number_of_close_curlyBracket'] += 1 if k == '{}': characters_based_features['number_of_pair_curlyBracket'] += 1 characters_based_features['number_of_characters'] = textstat.char_count( i, ignore_spaces=False) characters_based_features['number_of_quotation'] = len( re.compile(r"\"").findall(i)) characters_based_features['number_of_numbersigns'] = len( re.compile(r"\#").findall(i)) characters_based_features['number_of_dollar'] = len( re.compile(r"\$").findall(i)) characters_based_features['number_of_percent'] = len( re.compile(r"\%").findall(i)) characters_based_features['number_of_ampersand'] = len( re.compile(r"\&").findall(i)) characters_based_features['number_of_asterisk'] = len( re.compile(r"\*").findall(i)) characters_based_features['number_of_plus'] = len( re.compile(r"\+").findall(i)) characters_based_features['number_of_minus'] = len(