def get_special_metrics(text):
    """Return surface statistics, readability scores and TextBlob sentiment
    for *text*, grouped under 'statistics', 'difficulty' and 'sentiments'."""
    blob = TextBlob(text)
    statistics = {
        "syllables": textstat.syllable_count(text),
        "words": textstat.lexicon_count(text),
        "characters": textstat.char_count(text),
        "polysyllables": textstat.polysyllabcount(text),
        "average letter per word": textstat.avg_letter_per_word(text),
        "average sentence length": textstat.avg_sentence_length(text),
        "average sentence per word": textstat.avg_sentence_per_word(text),
        "sentences": textstat.sentence_count(text),
    }
    difficulty = {
        "flesch reading ease": textstat.flesch_reading_ease(text),
        "smog index": textstat.smog_index(text),
        "flesch kincaid grade": textstat.flesch_kincaid_grade(text),
        "coleman liau index": textstat.coleman_liau_index(text),
        # Deliberately disabled metrics (kept for reference):
        # automated_readability_index, dale_chall_readability_score,
        # difficult_words, linsear_write_formula
        "gunning fog": textstat.gunning_fog(text),
    }
    sentiments = {
        "polarity": blob.sentiment.polarity,
        "subjectivity": blob.sentiment.subjectivity,
    }
    return {
        "statistics": statistics,
        "difficulty": difficulty,
        "sentiments": sentiments,
    }
def get_special_metrics(text):
    """Bundle textstat counts, readability scores and TextBlob sentiment
    into a single nested dict keyed by metric group."""
    blob = TextBlob(text)
    # (label, metric function) pairs; each function is called as fn(text).
    stat_metrics = (
        ('syllables', textstat.syllable_count),
        ('words', textstat.lexicon_count),
        ('characters', textstat.char_count),
        ('polysyllables', textstat.polysyllabcount),
        ('average letter per word', textstat.avg_letter_per_word),
        ('average sentence length', textstat.avg_sentence_length),
        ('average sentence per word', textstat.avg_sentence_per_word),
        ('sentences', textstat.sentence_count),
    )
    difficulty_metrics = (
        ('flesch reading ease', textstat.flesch_reading_ease),
        ('smog index', textstat.smog_index),
        ('flesch kincaid grade', textstat.flesch_kincaid_grade),
        ('coleman liau index', textstat.coleman_liau_index),
        # Disabled in the original: automated_readability_index,
        # dale_chall_readability_score, difficult_words,
        # linsear_write_formula
        ('gunning fog', textstat.gunning_fog),
    )
    return {
        'statistics': {label: fn(text) for label, fn in stat_metrics},
        'difficulty': {label: fn(text) for label, fn in difficulty_metrics},
        'sentiments': {
            'polarity': blob.sentiment.polarity,
            'subjectivity': blob.sentiment.subjectivity,
        },
    }
def _calculate_scores(self, docs):
    """Return one dict of textstat metrics per document in *docs*."""
    # (key, metric function) table; several metrics were disabled in the
    # original (syllables, smog, coleman_liau, linsear, difficult_words,
    # gunning_fog) and remain so here.
    metric_table = (
        ('chars', ts.char_count),
        ('words', ts.lexicon_count),
        ('sents', ts.sentence_count),
        ('avg_sent_length', ts.avg_sentence_length),
        ('avg_syllables_per_word', ts.avg_syllables_per_word),
        ('avg_letters_per_word', ts.avg_letter_per_word),
        ('flesch', ts.flesch_reading_ease),
        ('automated_readability', ts.automated_readability_index),
        ('dale_chall', ts.dale_chall_readability_score),
        ('lix', ts.lix),
    )
    return [
        {key: metric(doc) for key, metric in metric_table}
        for doc in docs
    ]
def composition(text, file):
    """Write character, syllable, lexicon and sentence counts for *text*
    to the already-open writable *file* object."""
    counts = (
        textstat.char_count(text),
        textstat.syllable_count(text),
        textstat.lexicon_count(text),
        textstat.sentence_count(text),
    )
    file.write(
        '\nChar count : %d\nSyllabus count : %d \nLexicon count : %d \nSentence count : %d'
        % counts)
def scores_cal_ori(text):
    """Compute 14 textstat metrics for *text*.

    Returns a 14-tuple in this order:
    (char_count, lexicon_count, syllable_count, sentence_count,
     avg_sentence_length, avg_syllables_per_word, avg_letter_per_word,
     avg_sentence_per_word, flesch_kincaid_grade, smog_index,
     gunning_fog, difficult_words, dale_chall_readability_score,
     polysyllabcount)
    """
    # FIX: the original had a second, unreachable `return smog_index_value`
    # after this return statement — dead code removed.
    return (
        textstat.char_count(text, ignore_spaces=True),
        textstat.lexicon_count(text, removepunct=True),
        textstat.syllable_count(text),
        textstat.sentence_count(text),
        textstat.avg_sentence_length(text),
        textstat.avg_syllables_per_word(text),
        textstat.avg_letter_per_word(text),
        textstat.avg_sentence_per_word(text),
        textstat.flesch_kincaid_grade(text),
        textstat.smog_index(text),
        textstat.gunning_fog(text),
        textstat.difficult_words(text),
        textstat.dale_chall_readability_score(text),
        textstat.polysyllabcount(text),
    )
def stats(self, text):
    """Return a dict of textstat readability scores and raw counts for *text*.

    Key names (including their inconsistent spacing/casing) are preserved
    exactly, since callers may index the returned dict by them.
    """
    return {
        'flesch_reading_ease': textstat.flesch_reading_ease(text),
        'smog': textstat.smog_index(text),
        'flesch kincaid': textstat.flesch_kincaid_grade(text),
        'coleman Liau': textstat.coleman_liau_index(text),
        'automated': textstat.automated_readability_index(text),
        'dale chall': textstat.dale_chall_readability_score(text),
        'difficult': textstat.difficult_words(text),
        'linsear': textstat.linsear_write_formula(text),
        'gunning_fog': textstat.gunning_fog(text),
        'standard': textstat.text_standard(text),
        'charcount': textstat.char_count(text),
        'lexicon count': textstat.lexicon_count(text),
        'syllable count': textstat.syllable_count(text),
        'sentence count': textstat.sentence_count(text),
        'avg sentence length': textstat.avg_sentence_length(text),
        'avg_syllables_per_word': textstat.avg_syllables_per_word(text),
        'avg_letter_per_word': textstat.avg_letter_per_word(text),
        'avg_sentence_per_word': textstat.avg_sentence_per_word(text),
    }
def updateData(self):
    """Recompute all sentiment and readability metrics for self.text and
    append them to self.data as 11 positional entries; returns self.data.

    NOTE(review): assumes self.sid is an initialized sentiment analyzer
    exposing polarity_scores() (looks like NLTK VADER — confirm), and that
    self.splList is a pre-split word list of the text — confirm with caller.
    The grade helpers (freGrade, grade, ariGrade, daleChallGrade, txtGrade)
    and adjustScore are defined elsewhere on this class; adjustScore
    presumably clamps to [0, 18] given the /0.18 normalization — confirm.
    """
    # Full list of polarity scores
    self.polscore = self.sid.polarity_scores(self.text)

    ##### INDEX 0 IN DATA: Text Sentiment #####
    # [INDEX 0] Compounded score (0.0 - 1.0) [INDEX 1] Negative connotation rating (0.0 - 1.0),
    # [INDEX 2] Positive connotation rating (0.0 - 1.0) [INDEX 3] Neutral connotation rating (0.0 - 1.0)
    self.data.append([
        self.polscore['compound'], self.polscore['neg'],
        self.polscore['pos'], self.polscore['neu']
    ])

    ##### INDEX 1 IN DATA: Sentence Info #####
    # [INDEX 0] Sentence count [INDEX 1] Average sentence length
    # [INDEX 2] Syllable count [INDEX 3] Overall word count
    # [INDEX 4] Character count [INDEX 5] Character count without spaces
    # [INDEX 6] Avg letters per word [INDEX 7] Avg syllables per word
    self.data.append([
        textstat.sentence_count(self.text),
        textstat.avg_sentence_length(self.text),
        textstat.syllable_count(self.text),
        len(self.splList),
        textstat.char_count(self.text, False),   # with spaces
        textstat.char_count(self.text, True),    # ignore_spaces=True
        textstat.avg_letter_per_word(self.text),
        textstat.avg_syllables_per_word(self.text)
    ])

    ##### INDEX 2 IN DATA: Flesch Reading Ease #####
    # [INDEX 0] Pure score [INDEX 1] Approximate grade [INDEX 2] Normalized (ratio) score
    # SCORE SCALE: 0 - 100
    self.freRaw = textstat.flesch_reading_ease(self.text)
    self.freStat = min(max(self.freRaw, 0), 100)  # clamp raw score to [0, 100]
    self.data.append([
        round(self.freStat, 3),
        self.freGrade(self.freStat),
        round(abs(self.freStat - 100), 2)  # normalized as distance from 100
    ])

    ##### INDEX 3 IN DATA: Flesch-Kincaid Grade #####
    # [INDEX 0] Pure score [INDEX 1] Approximate grade [INDEX 2] Normalized (ratio) score
    # SCORE SCALE: 0 - 18
    self.fkgRaw = textstat.flesch_kincaid_grade(self.text)
    self.fkgStat = self.adjustScore(self.fkgRaw)
    self.data.append([
        round(self.fkgStat, 3),
        self.grade(self.fkgStat),
        round(self.fkgStat / 0.18, 2)  # percentage of the 0-18 scale
    ])

    ##### INDEX 4 IN DATA: Gunning FOG Index #####
    # [INDEX 0] Pure Score [INDEX 1] Approximate grade [INDEX 2] Normalized (ratio) score
    # SCORE SCALE: 0 - 18
    self.fogRaw = textstat.gunning_fog(self.text)
    self.fogStat = self.adjustScore(self.fogRaw)
    self.data.append([
        round(self.fogStat, 3),
        self.grade(self.fogStat),
        round(self.fogStat / 0.18, 2)
    ])

    ##### INDEX 5 IN DATA: SMOG Index #####
    # [INDEX 0] Pure Score [INDEX 1] Approximate grade [INDEX 2] Normalized (ratio) score
    # SCORE SCALE: 0 - 18
    self.smogRaw = textstat.smog_index(self.text)
    self.smogStat = self.adjustScore(self.smogRaw)
    self.data.append([
        round(self.smogStat, 3),
        self.grade(self.smogStat),
        round(self.smogStat / 0.18, 2)
    ])

    ##### INDEX 6 IN DATA: Automated Readability Index #####
    # [INDEX 0] Pure Score [INDEX 1] Approximate grade [INDEX 2] Normalized (ratio) score
    # SCORE SCALE: 0 - 14
    self.ariRaw = textstat.automated_readability_index(self.text)
    self.ariStat = min(max(self.ariRaw, 0), 14)  # clamp to [0, 14]
    self.data.append([
        round(self.ariStat, 3),
        self.ariGrade(ceil(self.ariStat)),  # ariGrade expects an integer level
        round(self.ariStat / 0.14, 2)
    ])

    ##### INDEX 7 IN DATA: Coleman-Liau Index #####
    # [INDEX 0] Pure Score [INDEX 1] Approximate grade [INDEX 2] Normalized (ratio) score
    # SCORE SCALE: 0 - 18
    self.cliRaw = textstat.coleman_liau_index(self.text)
    self.cliStat = self.adjustScore(self.cliRaw)
    self.data.append([
        round(self.cliStat, 3),
        self.grade(self.cliStat),
        round(self.cliStat / 0.18, 2)
    ])

    ##### INDEX 8 IN DATA: Linsear Write Index #####
    # [INDEX 0] Pure Score [INDEX 1] Approximate grade [INDEX 2] Normalized (ratio) score
    # SCORE SCALE: 0 - 18
    self.lwiRaw = textstat.linsear_write_formula(self.text)
    self.lwiStat = self.adjustScore(self.lwiRaw)
    self.data.append([
        round(self.lwiStat, 3),
        self.grade(self.lwiStat),
        round(self.lwiStat / 0.18, 2)
    ])

    ##### INDEX 9 IN DATA: Dale-Chall Readability Score #####
    # [INDEX 0] Pure Score [INDEX 1] Approximate grade [INDEX 2] Normalized (ratio) score
    # SCORE SCALE: 0 - 10
    self.dcrRaw = textstat.dale_chall_readability_score(self.text)
    self.dcrStat = min(max(self.dcrRaw, 0), 10)  # clamp to [0, 10]
    self.data.append([
        round(self.dcrStat, 3),
        self.daleChallGrade(self.dcrStat),
        round(self.dcrStat / 0.1, 2)
    ])

    ##### INDEX 10 IN DATA: Overall Score #####
    # [INDEX 0] Pure Score [INDEX 1] Approximate grade [INDEX 2] Normalized (ratio) score
    # SCORE SCALE: 0 - 20
    self.txtRaw = textstat.text_standard(self.text, True)  # True -> numeric (float) output
    self.txtStd = min(max(self.txtRaw, 0), 20)  # clamp to [0, 20]
    self.txtInfo = textstat.text_standard(self.text)       # human-readable grade string
    self.data.append([
        round(self.txtStd, 3),
        self.txtGrade(self.txtStd, self.txtInfo),
        round(self.txtStd / 0.2, 2)
    ])
    return self.data
def test_char_count(self):
    """char_count: default call vs. explicit ignore_spaces=False on the
    long fixture text (the latter counts the spaces too)."""
    without_spaces = textstat.char_count(self.long_test)
    with_spaces = textstat.char_count(self.long_test, ignore_spaces=False)
    self.assertEqual(1750, without_spaces)
    self.assertEqual(2123, with_spaces)
def preprocess(x):
    """Build the flat numeric feature vector for one article record *x*.

    *x* is a dict with keys 'id', 'targetTitle', 'targetParagraphs',
    'postText' and 'targetKeywords'. Returns a list of features; on any
    exception it prints the offending id and terminates the process
    (sys.exit), matching the original batch-job behavior.

    FIX (performance/readability, behavior unchanged): the original
    re-joined x['targetParagraphs'] for every one of ~20 textstat calls and
    x['postText'] twice; both joins and the repeated x['targetTitle']
    lookups are now hoisted.
    """
    print('PROCESSING ID: ' + str(x['id']))
    try:
        title = x['targetTitle']
        fvec = [int(x['id'])]  # Article ID
        fvec.append(nnp_num(title))

        if len(x['targetParagraphs']) > 0:
            body = ' '.join(x['targetParagraphs'])  # hoisted: joined once
            fvec.append(ts.automated_readability_index(body))
            fvec.append(ts.avg_letter_per_word(body))
            fvec.append(ts.avg_sentence_length(body))
            fvec.append(ts.avg_sentence_per_word(body))
            fvec.append(ts.avg_syllables_per_word(body))
            fvec.append(ts.char_count(body))
            fvec.append(ts.coleman_liau_index(body))
            fvec.append(ts.dale_chall_readability_score(body))
            fvec.append(ts.difficult_words(body))
            fvec.append(ts.flesch_kincaid_grade(body))
            fvec.append(ts.flesch_reading_ease(body))
            fvec.append(ts.gunning_fog(body))
            fvec.append(ts.lexicon_count(body))
            fvec.append(ts.linsear_write_formula(body))
            fvec.append(ts.polysyllabcount(body))
            fvec.append(ts.sentence_count(body))
            fvec.append(ts.smog_index(body))
            fvec.append(ts.syllable_count(body))
            fvec.append(mean_wordlen(x['targetParagraphs']))
            fvec += ratio(x['targetParagraphs'])  # 36 features
            fvec += ngram_feat(x['targetParagraphs'])  # 6 features
        else:
            fvec += [0] * 61  # zero-fill the paragraph feature slots

        post = ' '.join(x['postText'])  # hoisted: joined once
        if len(word_tokenize(post)) > 0:
            fvec.append(max_wordlen(x['postText']))
            fvec.append(sw_ratio(post))
            fvec += ngram_feat(x['postText'])  # 6 features
        else:
            fvec += [0] * 8  # zero-fill the post-text feature slots

        # Title-based POS/shape features (order must match the trained model).
        fvec.append(len(word_tokenize(title)))
        fvec.append(wlen_title(title))
        fvec.append(pos_2gram(title, 'NNP', 'NNP'))
        fvec.append(int(num_start(title)))
        fvec.append(in_num(title))
        fvec.append(pos_2gram(title, 'NNP', 'VBZ'))
        fvec.append(pos_2gram(title, 'IN', 'NNP'))
        fvec.append(wrb_num(title))
        fvec.append(nnp_num(title))
        fvec.append(int(wh_start(title)))
        fvec.append(int(qm_exist(title)))
        fvec.append(pos_thnn(title))
        fvec.append(prp_count(title))
        fvec.append(vbz_count(title))
        fvec.append(pos_3gram(title, 'NNP', 'NNP', 'VBZ'))
        fvec.append(pos_2gram(title, 'NN', 'IN'))
        fvec.append(pos_3gram(title, 'NN', 'IN', 'NNP'))
        fvec.append(pos_2gram(title, 'NNP', '.'))
        fvec.append(pos_2gram(title, 'PRP', 'VBP'))
        fvec.append(wp_count(title))
        fvec.append(dt_count(title))
        fvec.append(pos_2gram(title, 'NNP', 'IN'))
        fvec.append(pos_3gram(title, 'IN', 'NNP', 'NNP'))
        fvec.append(pos_count(title))
        fvec.append(pos_2gram(title, 'IN', 'NN'))

        if len(x['targetKeywords']) > 0 and len(x['postText']) > 0:
            fvec.append(kw_post_match(x['targetKeywords'], x['postText']))
        else:
            fvec += [0] * 1

        fvec.append(comma_count(title))
        fvec.append(pos_2gram(title, 'NNP', 'NNS'))
        fvec.append(pos_2gram(title, 'IN', 'JJ'))
        fvec.append(pos_2gram(title, 'NNP', 'POS'))
        fvec.append(wdt_count(title))
        fvec.append(pos_2gram(title, 'NN', 'NN'))
        fvec.append(pos_2gram(title, 'NN', 'NNP'))
        fvec.append(pos_2gram(title, 'NNP', 'VBD'))
        fvec.append(rb_count(title))
        fvec.append(pos_3gram(title, 'NNP', 'NNP', 'NNP'))
        fvec.append(pos_3gram(title, 'NNP', 'NNP', 'NN'))
        fvec.append(rbs_count(title))
        fvec.append(vbn_count(title))
        fvec.append(pos_2gram(title, 'VBN', 'IN'))
        fvec.append(pos_2gram(title, 'JJ', 'NNP'))
        fvec.append(pos_3gram(title, 'NNP', 'NN', 'NN'))
        fvec.append(pos_2gram(title, 'DT', 'NN'))
        fvec.append(ex_exist(title))
        fvec += ngram_feat(title)  # 6 features
    except Exception as e:
        # NOTE: deliberately aborts the whole run on a single bad record,
        # as the original did.
        print('EXCEPTION AT ID ' + str(x['id']))
        print(e)
        sys.exit()
    return fvec
re.findall(r'\b(he|she|it|his|hers|him|her|they|them|their)\b', str(tokens), flags=re.I)) prn_density = round(prn / len(tokens_np), 5) try: prn_noun_ratio = round(prn / len(nouns), 2) except ZeroDivisionError: prn_noun_ratio = 0 ## Readability features num_syllab = textstat.syllable_count(essay) avg_len_sent = textstat.avg_sentence_length(essay) # avg_sent_per_word = textstat.avg_sentence_per_word(essay) # num_polysyllab = textstat.polysyllabcount(essay) num_chars = textstat.char_count(essay, ignore_spaces=True) # avg_syllab_per_word = textstat.avg_syllables_per_word(essay) fre = textstat.flesch_reading_ease(essay) fkg = textstat.flesch_kincaid_grade(essay) cli = textstat.coleman_liau_index(essay) ari = textstat.automated_readability_index(essay) dcrs = textstat.dale_chall_readability_score(essay) dw = textstat.difficult_words(essay) lwf = textstat.linsear_write_formula(essay) gf = textstat.gunning_fog(essay) ## Stages of negation (features to improve validity for AES in ELL contexts) stage1a = len( re.findall(