def lex_readability(self, text, mode='fre'):
    """Compute one or more lexical readability scores for *text*.

    mode='all' returns a 5-tuple (Flesch reading ease, Gunning fog,
    Flesch-Kincaid grade, Dale-Chall score, text standard); any
    single-metric mode returns that score alone.  An unrecognised mode
    falls through and returns None, as in the original implementation.
    """
    if mode == 'all':
        return (
            textstat.flesch_reading_ease(text),
            textstat.gunning_fog(text),
            textstat.flesch_kincaid_grade(text),
            textstat.dale_chall_readability_score(text),
            textstat.text_standard(text, float_output=True),
        )
    # Single-metric modes handled via a dispatch table instead of an if-chain.
    single_metric = {
        'fre': textstat.flesch_reading_ease,
        'fog': textstat.gunning_fog,
        'fkg': textstat.flesch_kincaid_grade,
        'dcr': textstat.dale_chall_readability_score,
    }
    if mode in single_metric:
        return single_metric[mode](text)
    if mode == 'text_std':
        return textstat.text_standard(text, float_output=True)
def seven_test(processed_essay):
    """
    Compute the readability scores assigned to every script on the basis of
    some predefined formulas.

    Each returned list starts with a short header label ("FS", "GI", ...)
    followed by one score per essay, in input order.

    :param processed_essay: iterable of essays (each converted via str() before scoring)
    :return: flesch_score, gunning_index, kincaid_grade, liau_index,
             automated_readability_index, dale_readability_score,
             difficult_word, linsear_write
    """
    header_and_func = (
        ("FS", textstat.flesch_reading_ease),
        ("GI", textstat.gunning_fog),
        ("KG", textstat.flesch_kincaid_grade),
        ("LI", textstat.coleman_liau_index),
        ("ARI", textstat.automated_readability_index),
        ("DLS", textstat.dale_chall_readability_score),
        ("DW", textstat.difficult_words),
        ("LW", textstat.linsear_write_formula),
    )
    # One column per metric, each seeded with its header label.
    columns = [[header] for header, _ in header_and_func]
    # Single pass over the essays so generators are consumed only once.
    for essay in processed_essay:
        as_text = str(essay)
        for column, (_, func) in zip(columns, header_and_func):
            column.append(func(as_text))
    return tuple(columns)
def readability(text):
    """
    Provides the readability grade for the text, using the Flesch reading
    ease score (higher score = easier to read) plus the rounded
    Flesch-Kincaid grade.

    text: input text on which the score has to be calculated
    :return: (score, human-readable summary, grade)
    """
    # (threshold, summary) bands in descending order; first match wins.
    bands = (
        (90, "Very easy to read. Easily understood by an average 11-year-old student; "),
        (80, "Easy to read. Conversational English for consumers"),
        (70, "Fairly easy to read"),
        (60, "Plain English. Easily understood by 13- to 15-year-old students."),
        (50, "Fairly difficult to read."),
        (30, "Difficult to read"),
    )
    score = textstat.flesch_reading_ease(text)
    grade = round(textstat.flesch_kincaid_grade(text))
    summary = "Very difficult to read. Best understood by university graduates."
    for threshold, band_summary in bands:
        if score > threshold:
            summary = band_summary
            break
    return score, summary, grade
def analyze():
    """Flask endpoint: score the raw request body with a battery of
    textstat readability metrics and return them as a JSON report."""
    print(request)
    str_to_read = request.data.decode("utf-8").strip()
    # Metric name -> scorer pairs; insertion order matches the original dict.
    metrics = (
        ("flesch-reading-ease", textstat.flesch_reading_ease),
        ("smog-index", textstat.smog_index),
        ("flesch-kincaid-grade", textstat.flesch_kincaid_grade),
        ("coleman-liau-index", textstat.coleman_liau_index),
        ("automated-readability-index", textstat.automated_readability_index),
        ("dale-chall-readability-score", textstat.dale_chall_readability_score),
        ("difficult-words", textstat.difficult_words),
        ("linsear-write-formula", textstat.linsear_write_formula),
        ("gunning-fog", textstat.gunning_fog),
        ("text-standard", textstat.text_standard),
    )
    report = {name: scorer(str_to_read) for name, scorer in metrics}
    return decorate_response(jsonify(report))
def get_readibility(text, metric="flesch_kincaid_grade"):
    """
    Return a score which reveals a piece of text's readability level.

    Reference:
    https://chartbeat-labs.github.io/textacy/getting_started/quickstart.html
    https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
    """
    # Table-driven dispatch replaces the original if/elif ladder.
    scorers = {
        "flesch_kincaid_grade": textstat.flesch_kincaid_grade,
        "flesch_reading_ease": textstat.flesch_reading_ease,
        "smog_index": textstat.smog_index,
        "coleman_liau_index": textstat.coleman_liau_index,
        "automated_readability_index": textstat.automated_readability_index,
        "dale_chall_readability_score": textstat.dale_chall_readability_score,
        "difficult_words": textstat.difficult_words,
        "linsear_write_formula": textstat.linsear_write_formula,
        "gunning_fog": textstat.gunning_fog,
        "text_standard": textstat.text_standard,
    }
    scorer = scorers.get(metric)
    if scorer is None:
        # Same fallback as the original: warn on stdout and return None.
        print("ERROR: Please select correct metric!")
        return None
    return scorer(text)
def calculate_stats(data_folder):
    """Calculate stats of the test.json file in each dataset folder.

    For every dataset in the module-level ``dataset_fields`` mapping, loads
    <data_folder>/<dataset>/test.json, joins the configured text field of
    every item into one string, and prints the lexicon count, unique-word
    counts (all/lower/upper) and three textstat readability scores.

    :param data_folder: root folder containing one sub-folder per dataset
    """
    data_folder = Path(data_folder)
    for dataset in dataset_fields:
        print(f"loading {dataset}")
        field = dataset_fields[dataset]["text"].strip()
        # Context manager closes the handle; the original
        # json.load(open(...)) leaked the file descriptor.
        with open(data_folder / dataset / "test.json") as handle:
            items = json.load(handle)
        sentences = []
        for item in items:
            value = item[field]
            # Some datasets store the text as a list of turns; keep the last.
            # isinstance() replaces the non-idiomatic type(x) == list check.
            sentences.append(value[-1] if isinstance(value, list) else value)
        text = " ".join(sentences)
        lex_count = textstat.lexicon_count(text)
        print(lex_count)
        unique_words = count_words(text)
        print(f"all unique {len(unique_words)}")
        lower_unique_words = count_words(text, casing="lower")
        print(f"lowercase unique {len(lower_unique_words)}")
        upper_unique_words = count_words(text, casing="upper")
        print(f"uppercase unique {len(upper_unique_words)}")
        print(f"ratio {len(upper_unique_words) / len(unique_words)}")
        text_standard = textstat.text_standard(text, float_output=True)
        print(f"text_standard: {text_standard}")
        dale_chall_readability_score = textstat.dale_chall_readability_score(text)
        print(f"dale_chall_readability_score: {dale_chall_readability_score}")
        flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
        print(f"flesch_kincaid_grade: {flesch_kincaid_grade}")
def readability(queries):
    """Score each line in *queries* with ten textstat readability metrics.

    :param queries: iterable of text lines
    :return: dict mapping metric name -> list of per-line scores
    """
    # The original built an empty pandas DataFrame here and immediately
    # rebound the name to this dict; the dead DataFrame (and a leftover
    # commented-out sample line) has been removed.
    scores = {
        'Flesch': [],
        'Smog': [],
        'Flesch grade': [],
        'Coleman': [],
        'Automated': [],
        'Dale': [],
        'Difficult': [],
        'Linsear': [],
        'Gunning': [],
        'Text Standard': []
    }
    for line in queries:
        scores['Flesch'].append(textstat.flesch_reading_ease(line))
        scores['Smog'].append(textstat.smog_index(line))
        scores['Flesch grade'].append(textstat.flesch_kincaid_grade(line))
        scores['Coleman'].append(textstat.coleman_liau_index(line))
        scores['Automated'].append(textstat.automated_readability_index(line))
        scores['Dale'].append(textstat.dale_chall_readability_score(line))
        scores['Difficult'].append(textstat.difficult_words(line))
        scores['Linsear'].append(textstat.linsear_write_formula(line))
        scores['Gunning'].append(textstat.gunning_fog(line))
        scores['Text Standard'].append(
            textstat.text_standard(line, float_output=True))
    return scores
def fleschkincaid() -> List:
    """Return the Flesch-Kincaid grade for every text in the module-level
    ``policies['Policy']`` collection, in order."""
    return [textstat.flesch_kincaid_grade(text) for text in policies['Policy']]
def main():
    """Score every PDF under pdfs/ with Flesch reading ease and
    Flesch-Kincaid grade, writing the results to
    data/results/readability_results.csv.

    Files whose text extraction or scoring fails get 'N/A' scores.
    """
    rows = []
    for path in glob.glob('pdfs/*.pdf'):
        # NOTE(review): backslash split assumes Windows-style glob results —
        # confirm before running on POSIX.
        utility = str(path).split('\\')[1].split('.')[0]
        try:
            text = parser.from_file(path)
            rows.append({
                'Utility': utility,
                'FK Score': textstat.flesch_reading_ease(text['content']),
                'FK Grade Level': textstat.flesch_kincaid_grade(text['content']),
            })
        except Exception:
            # Narrowed from a bare except so Ctrl-C still interrupts.
            rows.append({
                'Utility': utility,
                'FK Score': 'N/A',
                'FK Grade Level': 'N/A',
            })
    # Build the DataFrame once from a row list; DataFrame.append was
    # deprecated in pandas 1.4 and removed in 2.0.
    df = pd.DataFrame(rows, columns=['Utility', 'FK Score', 'FK Grade Level'])
    df.to_csv('data/results/readability_results.csv', encoding='utf-8')
def textstat_stats(text):
    """Return a pandas Series of readability metrics for *text*.

    Syllable and lexicon counts are scaled by the whitespace-token count of
    the document.  (SMOG and text_standard were disabled in the original —
    SMOG is only meaningful on 30+ sentences.)
    """
    token_count = len(text.split())
    values = {
        'flesch_ease': ts.flesch_reading_ease(text),          # Flesch Reading Ease Score
        'flesch_grade': ts.flesch_kincaid_grade(text),        # Flesch-Kincaid Grade Level
        'gfog': ts.gunning_fog(text),                         # FOG index, also a grade level
        'auto_readability': ts.automated_readability_index(text),  # approx. grade level needed
        'cl_index': ts.coleman_liau_index(text),              # Coleman-Liau grade level
        'lw_formula': ts.linsear_write_formula(text),         # Linsear Write grade level
        'dcr_score': ts.dale_chall_readability_score(text),   # 3000-common-words lookup
        'syll_count': ts.syllable_count(text, lang='en_US') / token_count,
        'lex_count': ts.lexicon_count(text, removepunct=True) / token_count,
    }
    # dict insertion order gives the Series the same index as the original.
    return pd.Series(values)
def get_stats(text):
    """Return a dict of ten textstat readability metrics plus simple surface
    statistics for *text* (character length and double-quote count)."""
    stats = {
        "flesch_reading_ease": textstat.flesch_reading_ease(text),
        "smog_index": textstat.smog_index(text),
        "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
        "coleman_liau_index": textstat.coleman_liau_index(text),
        "automated_readability_index": textstat.automated_readability_index(text),
        "dale_chall_readability_score": textstat.dale_chall_readability_score(text),
        "difficult_words": textstat.difficult_words(text),
        "linsear_write_formula": textstat.linsear_write_formula(text),
        "gunning_fog": textstat.gunning_fog(text),
        "consolidated_score": textstat.text_standard(text),
        # character count — original TODO: think about excluding spaces
        "doc_length": len(text),
        "quote_count": text.count('"'),
    }
    return stats
def getReadabilityMetrics(test_data):
    '''
    For a given article IN TEXT FORMAT, returns its readability metrics.
    Uses the textstat library, please install it.
    '''
    # Metric name -> scorer; order matches the original dict literal.
    scorers = (
        ("flesch_reading_ease", textstat.flesch_reading_ease),
        ("smog_index", textstat.smog_index),
        ("flesch_kincaid_grade", textstat.flesch_kincaid_grade),
        ("coleman_liau_index", textstat.coleman_liau_index),
        ("automated_readability_index", textstat.automated_readability_index),
        ("dale_chall_readability_score", textstat.dale_chall_readability_score),
        ("difficult_words", textstat.difficult_words),
        ("linsear_write_formula", textstat.linsear_write_formula),
        ("gunning_fog", textstat.gunning_fog),
        ("text_standard", textstat.text_standard),
    )
    return {name: scorer(test_data) for name, scorer in scorers}
def get_reading_level(html):
    '''
    Returns the Flesch-Kincaid Grade of the text extracted from the given
    HTML.  This is a grade formula: a score of 9.3 means a ninth grader
    would be able to read the document.
    https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch%E2%80%93Kincaid_grade_level
    '''
    plain_text = get_text(html)
    return textstat.flesch_kincaid_grade(plain_text)
def score(text):
    """Return six readability scores for *text* as a tuple:
    (Flesch reading ease, Flesch-Kincaid grade, Gunning fog, SMOG,
    Coleman-Liau, automated readability index)."""
    return (
        textstat.flesch_reading_ease(text),
        textstat.flesch_kincaid_grade(text),
        textstat.gunning_fog(text),
        textstat.smog_index(text),
        textstat.coleman_liau_index(text),
        textstat.automated_readability_index(text),
    )
def readability_scores_mp(data):
    """Multiprocessing worker: unpack (result_dict, idx, text) and store
    [Flesch-Kincaid grade, Dale-Chall score] under result_dict[idx]."""
    result_dict, idx, text = data
    # flesch_reading_ease was intentionally disabled in the original.
    result_dict[idx] = [
        textstat.flesch_kincaid_grade(text),
        textstat.dale_chall_readability_score(text),
    ]
def do_datas():
    """Append text statistics for the current document to module-level lists.

    NOTE(review): relies entirely on module globals — the tokenised `words`,
    `vocab`, the raw `contents`, the sentiment word lists (lm_*, gi_*, hr_*),
    `gt_negation`, and all the accumulator lists (nw, nvocab, ..., smog).
    Confirm they are populated before each call.
    """
    # logging.info('do_datas')
    ########### Save text statistics
    ##### 1. nw 2. nvocab 3. nsyllable 4.nsentence 5. tone 6. readability
    ## 1. nw  (token count of the current document)
    nw.append(len(words))
    ## 2. nvocab  (vocabulary size)
    nvocab.append(len(vocab))
    ## 3. syllable
    n = textstat.syllable_count(contents)
    nsyllable.append(n)
    ## 4. sentence
    n = textstat.sentence_count(contents)
    nsentence.append(n)
    ## 5. tone
    ### LM (Loughran-McDonald) dictionary counts
    n_neg_lm.append(count_occurrence(words, lm_neg))
    n_pos_lm.append(count_occurrence(words, lm_pos))
    n_uctt_lm.append(count_occurrence(words, lm_uctt))
    n_lit_lm.append(count_occurrence(words, lm_lit))
    n_cstr_lm.append(count_occurrence(words, lm_cstr))
    n_modal1_lm.append(count_occurrence(words, lm_modal1))
    n_modal2_lm.append(count_occurrence(words, lm_modal2))
    n_modal3_lm.append(count_occurrence(words, lm_modal3))
    n_negation_lm.append(count_negation(words, lm_pos, gt_negation))
    ### General Inquirer dictionary
    n_neg_gi.append(count_occurrence(words, gi_neg))
    n_pos_gi.append(count_occurrence(words, gi_pos))
    n_negation_gi.append(count_negation(words, gi_pos, gt_negation))
    ### Henry dictionary
    n_neg_hr.append(count_occurrence(words, hr_neg))
    n_pos_hr.append(count_occurrence(words, hr_pos))
    # NOTE(review): this uses gi_pos, not hr_pos — looks like a copy-paste
    # slip from the GI section; confirm before changing.
    n_negation_hr.append(count_negation(words, gi_pos, gt_negation))
    ## 4. readability  (the "4" numbering repeats in the original comments)
    # Flesch reading ease, clamped to 100 and NaN-ed when negative.
    fre_i = textstat.flesch_reading_ease(contents)
    if fre_i > 100:
        fre_i = 100
    if fre_i < 0:
        fre_i = float('NaN')
    fre.append(fre_i)
    # Flesch-Kincaid grade, NaN-ed when negative.
    fkg_i = textstat.flesch_kincaid_grade(contents)
    if fkg_i < 0:
        fkg_i = float('NaN')
    fkg.append(fkg_i)
    # RIX
    # Coleman-Liau index, NaN-ed when negative.
    cl_i = textstat.coleman_liau_index(contents)
    if cl_i < 0:
        cl_i = float('NaN')
    cl.append(cl_i)
    f = textstat.gunning_fog(contents)
    fog.append(f)
    f = textstat.automated_readability_index(contents)
    ari.append(f)
    f = textstat.smog_index(contents)
    smog.append(f)
def metrics(sentence):
    """Return (fk, gf, dc, fk_label, gf_label, dc_label) for *sentence*:
    Flesch-Kincaid, Gunning fog and Dale-Chall scores rounded to 3 decimal
    places, followed by their grade labels."""
    fk_score = round(flesch_kincaid_grade(sentence), 3)
    gf_score = round(gunning_fog(sentence), 3)
    dc_score = round(dale_chall_readability_score(sentence), 3)
    # Dale-Chall is normalised to a grade band before labelling.
    labels = (
        grade_label(round(fk_score)),
        grade_label(round(gf_score)),
        grade_label(dale_chall_norm(round(dc_score))),
    )
    return (fk_score, gf_score, dc_score) + labels
def doc_calc(self, article):
    """Compute per-article features: three readability scores, a profanity
    probability and a sentiment polarity, returned as a pandas Series in
    that order."""
    features = [
        textstat.flesch_reading_ease(article),
        textstat.flesch_kincaid_grade(article),
        textstat.gunning_fog(article),
        predict_prob([article])[0],
        TextBlob(article).sentiment.polarity,
    ]
    return pd.Series(features)
def compute_readability_stats(text):
    """
    Compute reading statistics of the given text

    Reference: https://github.com/shivam5992/textstat

    Parameters
    ==========
    text: str, input section or abstract text

    Returns
    =======
    dict mapping metric name -> score; every value is None when textstat
    fails on the input (e.g. empty or non-sentence text).
    """
    metric_keys = (
        'flesch_reading_ease', 'smog', 'flesch_kincaid_grade',
        'coleman_liau_index', 'automated_readability_index', 'dale_chall',
        'difficult_words', 'linsear_write', 'gunning_fog', 'text_standard',
        'n_syllable', 'avg_letter_per_word', 'avg_sentence_length',
    )
    try:
        readability_dict = {
            'flesch_reading_ease': textstat.flesch_reading_ease(text),
            'smog': textstat.smog_index(text),
            'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
            'coleman_liau_index': textstat.coleman_liau_index(text),
            'automated_readability_index': textstat.automated_readability_index(text),
            'dale_chall': textstat.dale_chall_readability_score(text),
            'difficult_words': textstat.difficult_words(text),
            'linsear_write': textstat.linsear_write_formula(text),
            'gunning_fog': textstat.gunning_fog(text),
            'text_standard': textstat.text_standard(text),
            'n_syllable': textstat.syllable_count(text),
            'avg_letter_per_word': textstat.avg_letter_per_word(text),
            'avg_sentence_length': textstat.avg_sentence_length(text)
        }
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # still propagate; the None-filled fallback is unchanged.
        readability_dict = {key: None for key in metric_keys}
    return readability_dict
def score(self, strText):
    """Populate readability attributes on self for every textstat metric of
    *strText*: the numeric score plus a descriptive string label (via
    self.grade or a threshold table)."""
    self.automated_readability_index = textstat.automated_readability_index(
        strText)
    self.str_automated_readability_index = self.grade(
        self.automated_readability_index)
    self.coleman_liau_index = textstat.coleman_liau_index(strText)
    self.str_coleman_liau_index = self.grade(self.coleman_liau_index)

    # Dale-Chall bands, highest cutoff first; first match wins.
    self.dale_chall_readability_score = textstat.dale_chall_readability_score(
        strText)
    dale_chall_bands = (
        (9.0, '13th to 15th grade (college)'),
        (8.0, '11th to 12th grade'),
        (7.0, '9th to 10th grade'),
        (6.0, '7th to 8th grade'),
        (5.0, '5th to 6th grade'),
    )
    label = '4th grade or lower'
    for cutoff, band in dale_chall_bands:
        if self.dale_chall_readability_score >= cutoff:
            label = band
            break
    self.str_dale_chall_readability_score = ' | ' + label

    self.difficult_words = textstat.difficult_words(strText)
    self.flesch_kincaid_grade = textstat.flesch_kincaid_grade(strText)
    self.str_flesch_kincaid_grade = self.grade(self.flesch_kincaid_grade)

    # Flesch reading-ease bands, highest cutoff first.
    self.flesch_reading_ease = textstat.flesch_reading_ease(strText)
    flesch_bands = (
        (90, 'Very Easy'),
        (80, 'Easy'),
        (70, 'Fairly Easy'),
        (60, 'Standard'),
        (50, 'Fairly Difficult'),
        (30, 'Difficult'),
    )
    label = 'Very Confusing'
    for cutoff, band in flesch_bands:
        if self.flesch_reading_ease >= cutoff:
            label = band
            break
    self.str_flesch_reading_ease = ' | ' + label

    self.gunning_fog = textstat.gunning_fog(strText)
    self.str_gunning_fog = self.grade(self.gunning_fog)
    self.linsear_write_formula = textstat.linsear_write_formula(strText)
    self.str_linsear_write_formula = self.grade(self.linsear_write_formula)
    self.smog_index = textstat.smog_index(strText)
    self.str_smog_index = self.grade(self.smog_index)
    self.text_standard = textstat.text_standard(strText)
def getWordComplexityScore(self, tokens, i):
    """Return a grade-level readability score for *tokens*.

    A higher score means a document takes a higher education level to read.
    i == 1 selects Gunning fog, i == 2 selects SMOG, anything else selects
    the Flesch-Kincaid grade.
    """
    if i == 1:
        return textstat.gunning_fog(tokens)
    if i == 2:
        # Texts of fewer than 30 sentences are statistically invalid for
        # SMOG (the formula was normed on 30-sentence samples); textstat
        # requires at least 3 sentences per article for a result.
        return textstat.smog_index(tokens)
    return textstat.flesch_kincaid_grade(tokens)
def flesch_kincaid_grade(text):
    """
    Return the Flesch-Kincaid grade level of *text* via textstat.

    Formula: 0.39*ASL + 11.8*ASW - 15.59, where ASL is the average sentence
    length (words / sentences) and ASW is the average word length in
    syllables (syllables / words).

    :param text: The text
    :return: a grade level from 0-18, where 0 is the easiest and 18 the
             hardest.  Aim for a score of 8 to ensure that 80% of Americans
             can read/understand it.
    """
    grade = textstat.flesch_kincaid_grade(text)
    return grade
def create_readability_features(self):
    """
    Adds readability features to X_train and X_test using the textstat
    library.  Numbers represent the grade level needed to understand the
    review text.  ari: Automated Readability Index.
    """
    for frame in (self.X_train, self.X_test):
        reviews = frame["review_text"]
        # Pass the textstat functions directly instead of wrapping in lambdas.
        frame["review_text_readability_flesch_kincaid"] = reviews.apply(
            textstat.flesch_kincaid_grade)
        frame["review_text_ari"] = reviews.apply(
            textstat.automated_readability_index)
def calc_readby(sents_series0):
    """Score every sentence with three readability metrics.

    :param sents_series0: sequence/Series of sentence strings
        (assumes a default positional index — TODO confirm with callers)
    :return: DataFrame with columns flesch_readby, flesch_kincaid, fogIndex
    """
    flesch_readby = []
    flesch_kincaid = []
    fogIndex = []
    # enumerate() replaces the original range(len(...)) index loop.
    for i0, sent0 in enumerate(sents_series0):
        flesch_readby.append(textstat.flesch_reading_ease(sent0))
        flesch_kincaid.append(textstat.flesch_kincaid_grade(sent0))
        fogIndex.append(textstat.gunning_fog(sent0))
        if i0 % 10000 == 0:
            print(i0)  # progress heartbeat for large inputs
    df_readby = pd.DataFrame({'flesch_readby': flesch_readby,
                              'flesch_kincaid': flesch_kincaid,
                              'fogIndex': fogIndex})
    return df_readby
def generate_score(self, text):
    """Populate grade attributes on self for *text*: Flesch reading ease,
    Flesch-Kincaid, Dale-Chall (each with a consensus lookup), plus SMOG,
    ARI and Coleman-Liau raw scores."""
    self.flesch_reading_grade = ts.flesch_reading_ease(text)
    self.flesch_reading_grade_consensus = readability_test_consensus(
        self.flesch_reading_grade, flesch_ease_grading_system)
    self.flesch_kincaid_grade = ts.flesch_kincaid_grade(text)
    self.flesch_kincaid_grade_consensus = readability_test_consensus(
        self.flesch_kincaid_grade, us_grade_level_system_age)
    self.dale_chall_grade = ts.dale_chall_readability_score(text)
    self.dale_chall_grade_consensus = readability_test_consensus(
        self.dale_chall_grade, dale_chall_system)
    self.smog_grade = ts.smog_index(text)
    self.ari_grade = ts.automated_readability_index(text)
    # ARI consensus lookup intentionally disabled in the original:
    # self.ari_grade_consensus = readability_test_consensus(
    #     self.ari_grade, us_grade_level_system_level)
    self.coleman_liau_grade = ts.coleman_liau_index(text)
def process(self, df):
    """Add readability/style feature columns to *df* (expects an
    'articleBody' column), then pickle the feature-name list and the
    feature matrix to ../saved_data/read.pkl.

    NOTE(review): mutates *df* in place and returns 1 on completion.
    Relies on module-level `nltk`, `textstat`, `pickle` and `time`.
    """
    t0 = time()
    print("\n---Generating Readability Features:---\n")

    def lexical_diversity(text):
        # unique-token ratio: vocabulary size / token count
        words = nltk.tokenize.word_tokenize(text.lower())
        word_count = len(words)
        vocab_size = len(set(words))
        diversity_score = vocab_size / word_count
        return diversity_score

    def get_counts(text, word_list):
        # number of tokens in *text* that appear in *word_list*
        words = nltk.tokenize.word_tokenize(text.lower())
        count = 0
        for word in words:
            if word in word_list:
                count += 1
        return count

    # One column per textstat readability metric.
    df['flesch_reading_ease'] = df['articleBody'].map(lambda x: textstat.flesch_reading_ease(x))
    df['smog_index'] = df['articleBody'].map(lambda x: textstat.smog_index(x))
    df['flesch_kincaid_grade'] = df['articleBody'].map(lambda x: textstat.flesch_kincaid_grade(x))
    df['coleman_liau_index'] = df['articleBody'].map(lambda x: textstat.coleman_liau_index(x))
    df['automated_readability_index'] = df['articleBody'].map(lambda x: textstat.automated_readability_index(x))
    df['dale_chall_readability_score'] = df['articleBody'].map(lambda x: textstat.dale_chall_readability_score(x))
    df['difficult_words'] = df['articleBody'].map(lambda x: textstat.difficult_words(x))
    df['linsear_write_formula'] = df['articleBody'].map(lambda x: textstat.linsear_write_formula(x))
    df['gunning_fog'] = df['articleBody'].map(lambda x: textstat.gunning_fog(x))
    # First-person pronoun count, punctuation count, and lexical diversity.
    df['i_me_myself'] = df['articleBody'].apply(get_counts, args=(['i', 'me', 'myself'],))
    df['punct'] = df['articleBody'].apply(get_counts, args=([',', '.', '!', '?'],))
    df['lexical_diversity'] = df['articleBody'].apply(lexical_diversity)
    feats = ['flesch_reading_ease', 'smog_index', 'flesch_kincaid_grade',
             'coleman_liau_index', 'automated_readability_index',
             'dale_chall_readability_score', 'difficult_words',
             'linsear_write_formula', 'gunning_fog',
             'i_me_myself', 'punct', 'lexical_diversity'
             ]
    outfilename_xReadable = df[feats].values
    # Feature names first, then the matrix, with the highest pickle protocol.
    with open('../saved_data/read.pkl', 'wb') as outfile:
        pickle.dump(feats, outfile, -1)
        pickle.dump(outfilename_xReadable, outfile, -1)
    print('readable features saved in read.pkl')
    print('\n---Readability Features is complete---')
    print("Time taken {} seconds\n".format(time() - t0))
    return 1
def cal_readability(target, source): import pandas as pd tf_r_es = [textstat.flesch_reading_ease(t) for t in target] tf_k_gs = [textstat.flesch_kincaid_grade(t) for t in target] td_c_rs = [textstat.dale_chall_readability_score(t) for t in target] sf_r_es = [textstat.flesch_reading_ease(t) for t in source] sf_k_gs = [textstat.flesch_kincaid_grade(t) for t in source] sd_c_rs = [textstat.dale_chall_readability_score(t) for t in source] diff_r_es = [np.abs(tf_r_es[i] - sf_r_es[i]) for i in range(len(tf_r_es))] diff_k_gs = [np.abs(tf_k_gs[i] - sf_k_gs[i]) for i in range(len(tf_k_gs))] difd_c_rs = [np.abs(td_c_rs[i] - sd_c_rs[i]) for i in range(len(td_c_rs))] return {"Flesch ease mean gen": np.mean(tf_r_es), \ "Flesch ease mean orig": np.mean(sf_r_es), \ "Flesch ease mean diff": np.mean(diff_r_es), \ "Flesch grade mean gen": np.mean(tf_k_gs), \ "Flesch grade mean orig": np.mean(sf_k_gs), \ "Flesch grade mean diff": np.mean(diff_k_gs), \ "Dale Chall Readability V2 mean gen": np.mean(td_c_rs), \ "Dale Chall Readability V2 mean orig": np.mean(sd_c_rs), \ "Dale Chall Readability V2 mean diff": np.mean(difd_c_rs), \ },\ \ {"Flesch ease std dev gen": np.std(tf_r_es), \ "Flesch ease std dev orig": np.std(sf_r_es), \ "Flesch ease std dev diff": np.std(diff_r_es), \ "Flesch grade std dev gen": np.std(tf_k_gs), \ "Flesch grade std dev orig": np.std(sf_k_gs), \ "Flesch grade std dev diff": np.std(diff_k_gs), \ "Dale Chall Readability V2 std dev gen": np.std(td_c_rs),\ "Dale Chall Readability V2 std dev orig": np.std(sd_c_rs),\ "Dale Chall Readability V2 std dev diff": np.std(difd_c_rs)\ }
def readability_scores(self, text):
    """Compute eleven textstat readability metrics for *text* and cache each
    one as an attribute of the same name on self."""
    metric_funcs = (
        ('ari', textstat.automated_readability_index),
        ('flesch_kincaid_grade', textstat.flesch_kincaid_grade),
        ('coleman_liau_index', textstat.coleman_liau_index),
        ('dale_chall_readability_score', textstat.dale_chall_readability_score),
        ('flesch_reading_ease', textstat.flesch_reading_ease),
        ('gunning_fog', textstat.gunning_fog),
        ('linsear_write_formula', textstat.linsear_write_formula),
        ('lix', textstat.lix),
        ('rix', textstat.rix),
        ('smog_index', textstat.smog_index),
        ('text_standard', textstat.text_standard),
    )
    for attr_name, metric in metric_funcs:
        setattr(self, attr_name, metric(text))
def get_readability_stats(text):
    """Return a dict of nine textstat readability metrics for *text*;
    text_standard is returned in its float form."""
    scorers = (
        ('flesch_reading_ease', textstat.flesch_reading_ease),
        ('smog_index', textstat.smog_index),
        ('flesch_kincaid_grade', textstat.flesch_kincaid_grade),
        ('coleman_liau_index', textstat.coleman_liau_index),
        ('automated_readability_index', textstat.automated_readability_index),
        ('dale_chall_readability_score', textstat.dale_chall_readability_score),
        ('linsear_write_formula', textstat.linsear_write_formula),
        ('gunning_fog', textstat.gunning_fog),
        # text_standard needs the float_output flag, hence the lambda.
        ('text_standard', lambda t: textstat.text_standard(t, float_output=True)),
    )
    stats = {}
    for name, scorer in scorers:
        stats[name] = scorer(text)
    return stats
def analyze(fileName):
    """Strip markup from one transcript file and report its readability.

    Reads the file, flattens newlines, removes '*' markers, removes every
    [bracketed] span (brackets first, then the captured inner text — same
    order as the original), then prints the Flesch reading-ease and
    Flesch-Kincaid grade and appends them to the module-level file `outF`.
    """
    contents = Path(fileName).read_text(encoding='utf8')
    contents = contents.replace('\n', '').replace('*', '')
    bracketed = re.findall(r'\[(.*?)\]', contents)
    contents = contents.replace('[', '').replace(']', '')
    # for-loop replaces the original manual while/index iteration.
    for span in bracketed:
        contents = contents.replace(span, '')
    # Compute each score once instead of twice (print + file write).
    ease = textstat.flesch_reading_ease(contents)
    grade = textstat.flesch_kincaid_grade(contents)
    print('Reading ease is ' + str(ease))
    print('Score is ' + str(grade))
    # NOTE(review): the magic 105 slice presumably strips a fixed directory
    # prefix from the path — confirm against the caller.
    outF.write(fileName[105:] + ' ' + str(ease) + ' ' + str(grade) + '\n')
    # 417 political news articles from BBC
    # 386 entertainment news articles from BBC
def create_readability_plots_per_episode(file_list):
    """For each episode file, join all its spoken lines and compute the
    Flesch-Kincaid grade, then show a bar plot of grade per episode.

    Also concatenates every per-episode DataFrame into one combined frame,
    mirroring the original (which never used the combined frame afterwards).
    """
    readability_list = []
    combined = None
    for file in file_list:
        episode_df = create_dataframe(file)
        all_speech = " ".join(episode_df['Line'].as_matrix())
        readability_list.append(textstat.flesch_kincaid_grade(all_speech))
        # Accumulate all episode frames into a single DataFrame.
        if combined is None:
            combined = episode_df
        else:
            combined = pd.concat([combined, episode_df])
    print(readability_list)
    episode_numbers = list(range(1, len(readability_list) + 1))
    sns.barplot(x=episode_numbers, y=readability_list)
    sns.plt.show()
def test_flesch_kincaid_grade():
    """Pin textstat's Flesch-Kincaid grade for the long fixture text."""
    computed = textstat.flesch_kincaid_grade(long_test)
    assert computed == 10.0
main_characters_TOS = ['KIRK', 'SPOCK', 'MCCOY', 'SCOTT', 'UHURA', 'SULU', 'CHEKOV', 'CHAPEL'] # main_characters_TOS = ['KIRK', 'SPOCK', 'MCCOY', 'SCOTT', 'UHURUA', 'SULU', 'CHEKOV', 'CHAPEL', 'RAND', 'SAAVIK', 'SAREK'] main_characters_TNG = ['PICARD', 'RIKER', 'TROI', 'CRUSHER', 'WESLEY', 'DATA', 'LAFORGE', 'WORF', 'PULASKI', 'TASHA', "O'BRIEN", 'BARCLAY', 'RO', 'GUINAN', 'KEIKO', 'LWAXANA', 'Q', 'GOWRON'] # main_characters_TNG = ['PICARD', 'RIKER', 'TROI', 'CRUSHER', 'WESLEY', 'DATA', 'LAFORGE', 'WORF', 'PULASKI', 'TASHA', "O'BRIEN", 'BARCLAY', 'OGAWA', 'RO', 'GUINAN', 'KEIKO', 'LWAXANA', 'Q', 'GOWRON', 'ALEXANDER', 'KURN', 'LURSA', "B'ETOR", 'LORE', "K'EHLEYR", 'TRAVELLER', 'VASH', 'TOMALAK', 'NECHAYEV', 'LEFLER', 'MOT'] main_characters_DS9 = ['SISKO', 'KIRA', 'ODO', 'DAX', 'BASHIR', 'EZRI', 'QUARK', 'JAKE', 'ROM', 'NOG', 'LEETA', 'GARAK', 'KASIDY', 'MARTOK', 'ZIYAL', 'DUKAT', 'WINN', 'WEYOUN', 'FEMALE', 'DAMAR','VIC', 'ZEK', 'BAREIL', 'ISHKA', 'SLOAN', 'OPAKA', 'BRUNT','SHAKAAR'] # main_characters_DS9 = ['SISKO', 'KIRA', 'ODO', 'DAX', 'BASHIR', 'EZRI', 'QUARK', 'JAKE', 'ROM', 'NOG', 'LEETA', 'GARAK', 'KASIDY', 'MARTOK', 'ZIYAL', 'DUKAT', 'ROSS', 'WINN', 'WEYOUN', 'FEMALE', 'DAMAR', 'EDDINGTON', 'VIC', 'ZEK', 'BAREIL', 'ISHKA', 'SLOAN', 'TAIN', 'OPAKA', 'BRUNT', 'JOSEPH', 'SHAKAAR', 'MORA', 'KOR', 'EVEK', 'CRETAK', 'MILA'] main_characters_Voyager = ['JANEWAY', 'CHAKOTAY', 'TUVOK', 'PARIS', 'TORRES', 'KIM', 'SEVEN', 'EMH', 'NEELIX', 'KES', 'SESKA', 'QUEEN'] main_characters_Enterprise = ['ARCHER', "T'POL", 'TUCKER', 'REED', 'TRAVIS', 'HOSHI', 'PHLOX','FORREST', 'SOVAL', 'DANIELS'] main_chars_all = main_characters_TOS + main_characters_TNG + main_characters_DS9 + main_characters_Voyager + main_characters_Enterprise for char in main_chars_all: try: speech = " ".join((df[df['Character']==char])['Line'].as_matrix()) print(char, ':', textstat.flesch_kincaid_grade(speech)) except: continue # """remove puncuation from lines (after we remove stuff in parentheses)""" # # 
# Strip punctuation from the cleaned frame.
# NOTE(review): `remove_punctuation` and `df_clean` are defined elsewhere in
# this file — confirm they exist at this point in execution.  The commented
# lines below are an older per-column cleaning pipeline kept for reference.
df_cleaner = remove_punctuation(df_clean)
# df['Line'] = df['Line'].apply(clean_line)
# df['Character'] = df['Character'].apply(clean_character)
# df['Location'] = df['Location'].apply(clean_location)
# for
# print(df)
import seaborn as sns

"""Can create graphs of readability across entire seasons or whole series"""
# Script: one bar per input file (episode) showing its Flesch-Kincaid grade,
# saved to figures/Animated_readability_per_episode.
file_list = sys.argv[1:]
count = 0
file_name_list = []  # NOTE(review): collected but never used
readability_list = []
for file in file_list:
    """Create Dataframe"""
    df_individual = create_df.create_dataframe(file)
    # df_individual = create_dataframe(file)
    # Join every spoken line of the episode and score it as one document.
    all_speech = " ".join(df_individual['Line'].as_matrix())
    readability_list.append(textstat.flesch_kincaid_grade(all_speech))
    # print(df_individual)
    """Combine all dfs into one big dataframe"""
    if count == 0:
        df = df_individual
        count += 1
    else:
        df = pd.concat([df, df_individual])
# print(readability_list)
# x axis: episode numbers 1..N
end = len(readability_list) + 1
x = list(range(1, end))
sns.barplot(x=x, y=readability_list)
sns.plt.savefig('figures/Animated_readability_per_episode')