def fkBLEU_file(source, preds, refs, preprocess=as_is, pass_indiv=False):
    """Compute the corpus-average FKBLEU of a predictions file.

    FKBLEU combines iBLEU (Sun & Zhou 2012) with a sigmoid-squashed
    Flesch-reading-ease difference between hypothesis and source.

    :param source: path to the source-sentence file
    :param preds: path to the predictions file
    :param refs: path to the references file (tab-separated references per line)
    :param preprocess: callable applied to every sentence before tokenizing
    :returns: average fkdiff * iBLEU over scorable sentence pairs (0.0 if none)

    Fixes vs. the original: returns the accumulated ``fkbleu`` average
    (the original returned the last sentence's ``ibleu``), no longer
    clobbers the ``source`` argument inside the loop, guards against
    division by zero, and closes the files even on error.
    """
    files = [codecs.open(fis, "r", 'utf-8') for fis in [source, preds, refs]]
    fkbleu = 0.0
    n = 0
    try:
        for src, pred, ref in izip(*files):
            references = [word_tokenize(preprocess(r)) for r in ref.split('\t')]
            hypothese = word_tokenize(preprocess(pred))
            src_tokens = word_tokenize(preprocess(src))
            # iBLEU: reward similarity to references, penalize copying the source.
            ibleu = 0.9 * corpus_bleu(
                [references], [hypothese],
                smoothing_function=smooth.method3) - 0.1 * corpus_bleu(
                    [src_tokens], [hypothese],
                    smoothing_function=smooth.method3)
            try:
                fkdiff = textstat.flesch_reading_ease(
                    ' '.join(hypothese)) - textstat.flesch_reading_ease(
                        ' '.join(src_tokens))
            except Exception:
                # textstat can fail on degenerate sentences; skip the pair.
                continue
            n += 1
            fkdiff = 1 / (1 + np.exp(-fkdiff))  # squash to (0, 1)
            fkbleu += fkdiff * ibleu
    finally:
        for fis in files:
            fis.close()
    # Smoothing method 3: NIST geometric sequence smoothing
    if n == 0:
        return 0.0
    return fkbleu / n
def complexityAlongtheText(f, chunk_length=100):
    """Plot reading difficulty (100 - Flesch ease) over successive
    chunk_length-word chunks of the text loaded from *f*; the line colour
    encodes the whole-text average ease."""
    text = loadText(f)
    words = text.split()
    average = textstat.flesch_reading_ease(text)
    xs, ys = [], []
    for start in range(0, len(words), chunk_length):
        chunk = words[start:start + chunk_length]
        chunk.append('.')  # ensure the chunk ends in a sentence terminator
        ys.append(100 - textstat.flesch_reading_ease(' '.join(chunk)))
        xs.append(start)
    # Bucket the average ease into one of the preset colours.
    if average < 20:
        col = colours[4]
    elif average < 40:
        col = colours[6]
    elif average < 60:
        col = colours[3]
    elif average < 80:
        col = colours[1]
    else:
        col = colours[0]
    rgb = [1.0 / 255.0 * c for c in col]
    plt.plot(xs, ys, color=rgb, alpha=0.6, linewidth=5)
    plt.fill_between(xs, ys, color=rgb, alpha=0.3)
    # plt.plot( [0,max(x)], [average,average], color = 'gray')
    plt.ylim([0, 100])
    plt.xlabel("number of words")
    plt.ylabel("difficulty")
    plt.show()
def complexityAlongtheText(text, n_chunk=10):
    """Split *text* into roughly n_chunk chunks (at least 200 words each)
    and return {'values': [{'x': word offset, 'y': difficulty}, ...],
    'color': colour} where difficulty = 100 - Flesch reading ease.

    Fixes vs. the original: the bare ``except`` is narrowed to
    ``Exception``, the Python-2-only ``print`` statement is a portable
    ``print()`` call, and the Flesch score is computed once per chunk
    instead of twice.
    """
    words = text.split()
    chunk_length = len(words) / n_chunk
    if chunk_length < 200:
        chunk_length = 200
    chunk_length = int(chunk_length)
    x = []
    y = []
    cur = 0
    while cur < len(words):
        sub = words[cur:cur + chunk_length]
        sub.append('.')  # ensure the chunk ends in a sentence terminator
        sub_text = ' '.join(sub)
        try:
            diff = 100 - textstat.flesch_reading_ease(sub_text)
            if diff < 100:  # drop degenerate chunks with ease <= 0
                y.append(diff)
                x.append(cur)
        except Exception:
            print("cannot compute complexity in 'complexityAlongtheText' ")
        cur += chunk_length
    average = float(sum(y)) / float(len(y)) if y else 0
    # Bucket the average difficulty into one of the ordered colours.
    if average < 20:
        col = colours_ordered[0]
    elif average < 40:
        col = colours_ordered[1]
    elif average < 60:
        col = colours_ordered[2]
    elif average < 80:
        col = colours_ordered[3]
    else:
        col = colours_ordered[4]
    full_data = dict()
    full_data['values'] = [{'x': xi, 'y': yi} for xi, yi in zip(x, y)]
    full_data['color'] = col
    return full_data
def sentence_stats(s1, s2):
    """Feature vector for a sentence pair; s1 is the source and s2 the
    prediction. Returns a FloatTensor of [Flesch-ease difference,
    tree similarity / 100, document similarity]."""
    try:
        fkdiff = textstat.flesch_reading_ease(s2) - textstat.flesch_reading_ease(s1)
    except Exception:
        fkdiff = 0.0
    src_doc = nlp(s1)
    pred_doc = nlp(s2)
    return torch.FloatTensor([
        fkdiff,
        tree_sim(src_doc, pred_doc) / 100,
        doc_sim(src_doc, pred_doc),
    ])
def readability(text, file):
    """Write Gunning Fog, Flesch ease and Flesch-Kincaid grade of *text*
    to the already-open *file* handle."""
    scores = (textstat.gunning_fog(text),
              textstat.flesch_reading_ease(text),
              textstat.flesch_kincaid_grade(text))
    file.write(
        '\nGunning Fog Index: %d \nFlesch Reading Ease: %d \nFlesch-Kincaid Grade: %d'
        % scores)
def _calculate_scores(self, docs):
    """Return a list of dicts, one per document, with textstat counts and
    readability scores (the commented-out metrics of the original are
    intentionally omitted)."""
    metrics = (
        ('chars', ts.char_count),
        ('words', ts.lexicon_count),
        ('sents', ts.sentence_count),
        ('avg_sent_length', ts.avg_sentence_length),
        ('avg_syllables_per_word', ts.avg_syllables_per_word),
        ('avg_letters_per_word', ts.avg_letter_per_word),
        ('flesch', ts.flesch_reading_ease),
        ('automated_readability', ts.automated_readability_index),
        ('dale_chall', ts.dale_chall_readability_score),
        ('lix', ts.lix),
    )
    return [{name: fn(doc) for name, fn in metrics} for doc in docs]
def main():
    """Read article rows from the CSV named by sys.argv[1] and write
    per-article readability statistics to sys.argv[2].

    Fix: both files are managed with ``with`` so they are closed even if
    an exception occurs mid-run (the original leaked sys.argv[2] on error
    and redundantly re-closed the input file inside its ``with`` block).
    """
    with open(sys.argv[2], 'w', encoding="utf8") as csv_file2, \
            open(sys.argv[1], 'r', encoding="utf8", errors='ignore') as csv_file1:
        writer = csv.writer(csv_file2, delimiter=',')
        writer.writerow(["ID", "URL", "text", "impact-score", "readability",
                         "grade-level", "smog-index", "total-words",
                         "total-sentences"])
        reader = csv.reader(csv_file1)
        next(reader)  # Skip the first line with headers
        doc_id = 1
        for row in reader:
            impact = str(row[0])
            url = str(row[1])
            text = str(row[2])
            # Uncomment this if we want summary and key words
            # summary = summarize(text, ratio=0.3)
            # key_words = keywords(text, ratio=0.3)
            writer.writerow([doc_id, url, text, impact,
                             textstat.flesch_reading_ease(text),
                             textstat.flesch_kincaid_grade(text),
                             textstat.smog_index(text),
                             textstat.lexicon_count(text),
                             textstat.sentence_count(text)])
            doc_id = doc_id + 1
    print('Summary statistics complete!')
def get_special_metrics(text):
    """Bundle textstat surface statistics, difficulty scores and TextBlob
    sentiment for *text* into one nested dict."""
    blob = TextBlob(text)
    statistics = {
        "syllables": textstat.syllable_count(text),
        "words": textstat.lexicon_count(text),
        "characters": textstat.char_count(text),
        "polysyllables": textstat.polysyllabcount(text),
        "average letter per word": textstat.avg_letter_per_word(text),
        "average sentence length": textstat.avg_sentence_length(text),
        "average sentence per word": textstat.avg_sentence_per_word(text),
        "sentences": textstat.sentence_count(text),
    }
    difficulty = {
        "flesch reading ease": textstat.flesch_reading_ease(text),
        "smog index": textstat.smog_index(text),
        "flesch kincaid grade": textstat.flesch_kincaid_grade(text),
        "coleman liau index": textstat.coleman_liau_index(text),
        # automated readability / dale chall / difficult words /
        # linsear write were disabled in the original as well
        "gunning fog": textstat.gunning_fog(text),
    }
    sentiments = {
        "polarity": blob.sentiment.polarity,
        "subjectivity": blob.sentiment.subjectivity,
    }
    return {"statistics": statistics,
            "difficulty": difficulty,
            "sentiments": sentiments}
def readability_analysis(self, text):
    """Build a readability report for *text*: Flesch ease clamped to
    [0, 100], syllable/letter averages, a grade from
    self.readability_grade, and the words with five or more syllables."""
    syllables_by_word = {}
    for raw in text.split():
        word = "".join(ch for ch in raw if ch not in punctuation)
        syllables_by_word[word] = textstat.syllable_count(word)
    # Words of >= 5 syllables count as "difficult".
    hard_words = [w for w in syllables_by_word if syllables_by_word[w] >= 5]
    ease = textstat.flesch_reading_ease(text)
    ease = min(100, max(0, ease))  # clamp to the nominal scale
    readability = {
        "flesch_reading_ease": ease,
        "avg_syllables_per_word": textstat.avg_syllables_per_word(text),
        "syllable_count": textstat.syllable_count(text),
        "avg_letter_per_word": textstat.avg_letter_per_word(text),
    }
    grade, score = self.readability_grade(readability)
    readability['grade'] = grade
    readability['score'] = score
    readability['difficult_words'] = hard_words
    return readability
def _get_reading_stats(no_code_text):
    """
    Returns reading level information
    :param no_code_text: String to analyse
    :return: list of details
    """
    group_by = 'Reading Level Analysis '
    results = []

    def add(label, value):
        results.append(TextFeature(label, value, group_by))

    def add_guarded(label, fn, errors=(IndexError,)):
        # Several textstat scorers intermittently raise on odd input;
        # report a placeholder instead of crashing the analysis.
        try:
            add(label, fn(no_code_text))
        except errors:
            add(label, "Undetermined")

    # higher is better, scale 0 to 100
    add('Flesch Reading Ease', textstat.flesch_reading_ease(no_code_text))
    add('Flesch-Kincaid Grade Level', textstat.flesch_kincaid_grade(no_code_text))
    add_guarded('The Fog Scale (Gunning FOG formula)', textstat.gunning_fog)
    add_guarded('The SMOG Index', textstat.smog_index)
    add('Automated Readability Index',
        textstat.automated_readability_index(no_code_text))
    add('The Coleman-Liau Index', textstat.coleman_liau_index(no_code_text))
    add_guarded('Linsear Write Formula', textstat.linsear_write_formula)
    add_guarded('Dale Chall Readability Score',
                textstat.dale_chall_readability_score)
    try:
        add('Readability Consensus', textstat.readability_consensus(no_code_text))
    except (TypeError, IndexError):
        add('Readability Consensus',
            "Undetermined; One of the tests above failed.")
    return results
def readability(text):
    """Print a battery of readability scores for *text*.

    Fix: the Python-2-only ``print`` statements are rewritten as
    single-argument ``print()`` calls, which behave identically on
    Python 2 and 3.
    """
    print("Readability\n=================================\n\n")
    print("Flesch Reading Ease\n________________________\n\n")
    print(str(textstat.flesch_reading_ease(text)) + "\n")
    print("Smog Index\n________________________\n\n")
    print(str(textstat.smog_index(text)) + "\n")
    print("Flesch Kincaid Grade\n________________________\n\n")
    print(str(textstat.flesch_kincaid_grade(text)) + "\n")
    print("Coleman Liau Index\n________________________\n\n")
    print(str(textstat.coleman_liau_index(text)) + "\n")
    print("ARI\n________________________\n\n")
    print(str(textstat.automated_readability_index(text)) + "\n")
    print("Dale Chall\n________________________\n\n")
    print(str(textstat.dale_chall_readability_score(text)) + "\n")
    print("Difficult Words\n________________________\n\n")
    print(str(textstat.difficult_words(text)) + "\n")
    print("Linsear Write Formula\n________________________\n\n")
    print(str(textstat.linsear_write_formula(text)) + "\n")
    print("Gunning Fog\n________________________\n\n")
    print(str(textstat.gunning_fog(text)) + "\n")
    print("Compiled Score\n_____________________________\n\n")
    print(str(textstat.text_standard(text)) + "\n")
    # NOTE(review): ``adjectives`` is not defined anywhere in this function;
    # unless a module-level ``adjectives`` exists this raises NameError —
    # confirm the intended return value with the caller.
    return len(adjectives)
def main():
    """ Evaluate and print Readability scores """
    if len(sys.argv) <= 1:
        sys.stderr.write('Error: specify input file.\n')
        sys.exit()
    inf = open(sys.argv[1], 'r')
    text = inf.read()
    inf.close()

    lexcount = textstat.lexicon_count(text)
    sys.stdout.write('Lexicon count: {0:d}\n'.format(lexcount))

    # Reading time in minutes: assumes 180 WPM plus a 250-word offset.
    tread = (lexcount + 250) / 180.
    sys.stdout.write(
        'Estimating reading time: {0:1.1f} minutes.\n'.format(tread))

    ease = textstat.flesch_reading_ease(text)
    grade = textstat.flesch_kincaid_grade(text)
    sys.stdout.write('Flesch reading ease score: {0:1.1f}\n'.format(ease))
    sys.stdout.write('Flesch-Kincaid grade: {0:1.1f}\n'.format(grade))
def text_analytics(text):
    """Return 27 readability features for *text*: three counts, six
    difficulty scores, then every score multiplied by every count.
    Returns None (implicitly, as the original did) when the text has no
    sentences."""
    if textstat.sentence_count(text) == 0:
        return None
    counts = [textstat.lexicon_count(text),    # word count
              textstat.sentence_count(text),   # sentence count
              textstat.syllable_count(text)]   # syllable count
    scores = [textstat.flesch_reading_ease(text),
              textstat.smog_index(text),
              textstat.gunning_fog(text),
              textstat.dale_chall_readability_score(text),
              textstat.automated_readability_index(text),
              textstat.coleman_liau_index(text)]
    features = counts + scores
    # Interaction features: each score scaled by each count, in the same
    # order as the original hand-written list.
    for s in scores:
        for c in counts:
            features.append(c * s)
    return features
def save_page(request):
    """Django view: fetch the page at ``url``, score its text (sentiment,
    subjectivity, Flesch reading ease) and persist it as a Page in the
    requesting user's named folder."""
    url = request.GET.get('url')
    desc = request.GET.get('desc')
    title = request.GET.get('title')
    folder_name = request.GET.get('folder_name')

    # Download and strip markup before scoring. Scoring the full page
    # text noticeably slows the app down.
    r = requests.get(url)
    page_text = BeautifulSoup(r.text, "html.parser").text
    blob = TextBlob(page_text)
    scr_polarity = "{:.2f}".format(blob.sentiment.polarity)
    scr_subjectivity = "{:.2f}".format(blob.sentiment.subjectivity)
    scr_readability = "{:.2f}".format(textstat.flesch_reading_ease(page_text))

    # Persist the scored page under the user's folder.
    user_profile = UserProfile.objects.all().get(user=request.user)
    folder = Folder.objects.all().get(name=folder_name, user=user_profile)
    P = Page.objects.get_or_create(title=title, url=url, summary=desc,
                                   readability_score=scr_readability,
                                   objectivity_score=scr_subjectivity,
                                   sentimentality_score=scr_polarity,
                                   folder=folder)
    return HttpResponse('Was a success')
def get_special_metrics(text):
    """Statistics, difficulty scores and TextBlob sentiment for *text*,
    returned as one nested dict (same shape as the other copy of this
    helper in the codebase)."""
    stat_funcs = [
        ('syllables', textstat.syllable_count),
        ('words', textstat.lexicon_count),
        ('characters', textstat.char_count),
        ('polysyllables', textstat.polysyllabcount),
        ('average letter per word', textstat.avg_letter_per_word),
        ('average sentence length', textstat.avg_sentence_length),
        ('average sentence per word', textstat.avg_sentence_per_word),
        ('sentences', textstat.sentence_count),
    ]
    difficulty_funcs = [
        ('flesch reading ease', textstat.flesch_reading_ease),
        ('smog index', textstat.smog_index),
        ('flesch kincaid grade', textstat.flesch_kincaid_grade),
        ('coleman liau index', textstat.coleman_liau_index),
        # automated readability / dale chall / difficult words /
        # linsear write were disabled in the original as well
        ('gunning fog', textstat.gunning_fog),
    ]
    sentiment = TextBlob(text).sentiment
    return {
        'statistics': {k: f(text) for k, f in stat_funcs},
        'difficulty': {k: f(text) for k, f in difficulty_funcs},
        'sentiments': {'polarity': sentiment.polarity,
                       'subjectivity': sentiment.subjectivity},
    }
def getFeatures(files):
    """For each row of the dataframe *files*, build surface features from
    the concatenated review + summary text. Returns (per-row feature
    lists, overall scores, Flesch reading ease values)."""
    all_rows = []
    all_scores = []
    all_fres = []
    df = files
    for i in range(df.shape[0]):
        review = (df.iloc[i].reviewText).strip().split()
        summary = (df.iloc[i].summary).strip().split()
        # str() of the numpy array is what gets scored, as in the original.
        combined = str(np.append(review, summary))
        score = float(df.iloc[i].overall)
        row_features = [lengthOfReview(combined),
                        sentenceCount(combined),
                        charCount(combined),
                        allCapCount(combined),
                        questionCount(combined),
                        score]
        all_rows.append(row_features)
        all_scores.append(score)
        all_fres.append(textstat.flesch_reading_ease(combined))
    return all_rows, all_scores, all_fres
def compareContents():
    """Flask view: on POST, score the submitted poem with five readability
    metrics, stash everything in the session and re-render the page with
    the metrics; on GET just render the empty form."""
    if request.method != "POST":
        return render_template('compareContents.html')
    line = request.form['poem']
    poem1 = request.form['poem1']  # read (and required) but unused, as before
    session['line'] = line
    fre = textstat.flesch_reading_ease(line)
    session['fre'] = fre
    smog = textstat.smog_index(line)
    session['smog'] = smog
    fkg = textstat.flesch_kincaid_grade(line)
    session['fkg'] = fkg
    dcr = textstat.dale_chall_readability_score(line)
    session['dcr'] = dcr
    gf = textstat.gunning_fog(line)
    session['gf'] = gf
    metrics = True
    return render_template('compareContents.html', metrics=metrics, line=line,
                           fre=fre, smog=smog, fkg=fkg, dcr=dcr, gf=gf)
def other_features(tweet):
    """Sentiment, readability, length and Twitter-specific features for a
    tweet, returned as a flat list.

    Fix: ``tweet.lower().startswith("rt") is True`` compared a bool by
    identity; ``startswith`` already returns a bool, so the ``is True``
    is dropped.
    """
    # Sentiment
    sentiment = VS(tweet)

    # Readability — see https://pypi.python.org/pypi/textstat/
    flesch = round(textstat.flesch_reading_ease(tweet), 3)
    flesch_kincaid = round(textstat.flesch_kincaid_grade(tweet), 3)
    gunning_fog = round(textstat.gunning_fog(tweet), 3)

    # Text-based
    length = len(tweet)
    num_terms = len(tweet.split())

    # Twitter-specific
    hashtag_count = tweet.count("#")
    mention_count = tweet.count("@")
    url_count = tweet.count("http")
    retweet = 0
    if tweet.lower().startswith("rt"):
        retweet = 1
    # Also flag tweets containing a bare "rt"/"#rt" token.
    words = tweet.lower().split()
    if "rt" in words or "#rt" in words:
        retweet = 1

    features = [
        sentiment['compound'], flesch, flesch_kincaid, gunning_fog, length,
        num_terms, hashtag_count, mention_count, url_count, retweet
    ]
    return features
def getTopicsDistributionWithinTheText(path, chunk_length=300):
    """Track how the document's dominant LDA topics evolve through the text.

    The document at *path* is split into chunk_length-word chunks; for each
    chunk, the per-topic probabilities (for topics whose whole-document
    score exceeds the module-level ``min_score``) are recorded.

    :param path: path of the document to analyse
    :param chunk_length: words per chunk
    :returns: (scores, global_scores) where scores maps topic id ->
        per-chunk probability list and global_scores is the whole-document
        topic distribution from topicsFromTokens
    """
    # Whole-document topic distribution.
    global_scores = topicsFromTokens(tokenize(loadText(path)))
    text = loadText(path)
    words = text.split()
    # NOTE(review): computed but never used — presumably left over from a
    # readability variant of this function; confirm before removing.
    average = textstat.flesch_reading_ease(text)
    scores = dict()
    # Track only topics that score above min_score for the whole document.
    for i in sorted(global_scores, key=lambda tup: tup[1], reverse=True):
        if i[1] > min_score:
            scores[i[0]] = []
    x = []
    y = []
    cur = 0
    i = 1  # number of chunks processed so far (1-based); reuses name ``i``
    while (cur < len(words)):
        sub = words[cur:cur + chunk_length]
        sub.append('.')  # ensure the chunk ends in a sentence terminator
        sub_text = ' '.join(sub)
        cur += chunk_length
        bow = lda.id2word.doc2bow(raw_tokenize(sub_text))
        score = lda.get_document_topics(bow)
        # Record this chunk's probability for each tracked topic.
        for s in score:
            if s[0] in scores.keys():
                scores[s[0]].append(s[1])
        # Pad tracked topics absent from this chunk with 0 so every list
        # stays aligned to the chunk index.
        for s in scores:
            if len(scores[s]) < i:
                scores[s].append(0)
        i += 1
    return scores, global_scores
def other_features(tweet):
    """Build the feature list for a tweet: VADER compound sentiment, three
    readability scores, raw lengths and Twitter-specific counts.

    Fix: dropped the ``is True`` identity comparison on the boolean
    returned by ``startswith`` (an anti-idiom; behavior is unchanged).
    """
    # Sentiment
    sentiment = VS(tweet)

    # Readability — see https://pypi.python.org/pypi/textstat/
    flesch = round(textstat.flesch_reading_ease(tweet), 3)
    flesch_kincaid = round(textstat.flesch_kincaid_grade(tweet), 3)
    gunning_fog = round(textstat.gunning_fog(tweet), 3)

    # Text-based
    length = len(tweet)
    num_terms = len(tweet.split())

    # Twitter-specific
    hashtag_count = tweet.count("#")
    mention_count = tweet.count("@")
    url_count = tweet.count("http")
    retweet = 0
    if tweet.lower().startswith("rt"):
        retweet = 1
    # Checking if RT is in the tweet
    words = tweet.lower().split()
    if "rt" in words or "#rt" in words:
        retweet = 1

    features = [sentiment['compound'], flesch, flesch_kincaid, gunning_fog,
                length, num_terms, hashtag_count, mention_count, url_count,
                retweet]
    return features
def f():
    """Read lines from abc.txt and write four readability scores per line
    to Readability_Scores.xls.

    Fixes: the input file is now closed via ``with`` (it was leaked), and
    the ``smog_index`` computation whose result was never written to the
    sheet is dropped (the header never had a SMOG column).
    """
    print("hello")
    book = xlwt.Workbook()
    worksheet = book.add_sheet('ReadabilityScore')
    worksheet.write(0, 0, "Gen_sent")
    worksheet.write(0, 1, "flesch_reading_ease")
    worksheet.write(0, 2, "flesch_kincaid_grade")
    worksheet.write(0, 3, "dale_chall_readability_score")
    worksheet.write(0, 4, "gunning_fog")
    with open('abc.txt') as fh:  # , encoding='utf-8')
        row = 1
        for line in fh:
            worksheet.write(row, 0, line)
            worksheet.write(row, 1, textstat.flesch_reading_ease(line))
            worksheet.write(row, 2, textstat.flesch_kincaid_grade(line))
            worksheet.write(row, 3, textstat.dale_chall_readability_score(line))
            worksheet.write(row, 4, textstat.gunning_fog(line))
            row += 1
    book.save('Readability_Scores.xls')
def process(url):
    """Fetch *url*, strip scripts/styles and whitespace noise, and return
    (subjectivity, polarity, readability) where the first two are
    inverted percentages (less subjective / less polar scores higher)."""
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")

    # Remove non-content markup before extracting text.
    for tag in soup(["script", "style"]):
        tag.extract()
    raw = soup.get_text()

    # Normalise whitespace: strip each line, split multi-headline lines,
    # and drop blanks.
    lines = (ln.strip() for ln in raw.splitlines())
    chunks = (phrase.strip() for ln in lines for phrase in ln.split(" "))
    text = '\n'.join(chunk for chunk in chunks if chunk)

    sent = TextBlob(text).sentiment
    subj = 100 - int(sent.subjectivity * 100)   # the less subjective, the better
    polar = 100 - int(sent.polarity * 100)      # the less polar, the better
    readability = int(textstat.flesch_reading_ease(text))
    return subj, polar, readability
def do_text_stats(self, text):
    """Compute the full textstat battery for *text*.

    The scorers that are known to raise TypeError on odd input (Flesch
    ease, Flesch-Kincaid, Coleman-Liau, text_standard) map failures to
    None instead of propagating.

    Flesch Reading Ease bands: 90-100 very easy, 80-89 easy, 70-79 fairly
    easy, 60-69 standard, 50-59 fairly difficult, 30-49 difficult,
    0-29 very confusing.
    """
    def safe(fn):
        try:
            return fn(text)
        except TypeError:
            return None

    return {
        "syllable_count": textstat.syllable_count(text),
        "lexicon_count": textstat.lexicon_count(text, True),
        "sentence_count": textstat.sentence_count(text),
        "flesch_reading_ease": safe(textstat.flesch_reading_ease),
        "flesch_kincaid_grade": safe(textstat.flesch_kincaid_grade),
        "gunning_fog": textstat.gunning_fog(text),
        "smog_index": textstat.smog_index(text),
        "automated_readability_index":
            textstat.automated_readability_index(text),
        "coleman_liau_index": safe(textstat.coleman_liau_index),
        "linsear_write_formula": textstat.linsear_write_formula(text),
        "dale_chall_readability_score":
            textstat.dale_chall_readability_score(text),
        "text_standard": safe(textstat.text_standard),
    }
def get_readability(df2):
    """Return a copy of *df2* with one readability-metric column appended
    per object-dtype column (column names are metric name + column index,
    matching the original's '{metric}{i}' scheme)."""
    metric_table = [
        ('flesch_reading_ease', textstat.flesch_reading_ease),
        ('smog_index', textstat.smog_index),
        ('flesch_kincaid_grade', textstat.flesch_kincaid_grade),
        ('coleman_liau_index', textstat.coleman_liau_index),
        ('automated_readability_index', textstat.automated_readability_index),
        ('dale_chall_readability_score', textstat.dale_chall_readability_score),
        ('difficult_words', textstat.difficult_words),
        ('linsear_write_formula', textstat.linsear_write_formula),
        ('gunning_fog', textstat.gunning_fog),
        ('text_standard', textstat.text_standard),
    ]
    df = df2.copy()
    text_feats = df.select_dtypes(include=['object']).columns.values
    for i, col in enumerate(text_feats):
        for name, fn in metric_table:
            df['{}{}'.format(name, i)] = df[col].apply(fn)
    return df
def flesch_reading_ease_score():
    """For every row of the module-level ``df``, strip <code> blocks and
    markup from the question Body, compute its Flesch reading ease (0 for
    bodies with no valid words), and save the augmented frame to
    combined.csv.

    Fixes: the Python-2-only ``print`` statement is a portable ``print()``
    call, and the per-row score variable no longer shadows the function's
    own name.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    final_scores = []
    for index, row in df.iterrows():
        # Drop code blocks entirely, then strip remaining HTML.
        body_only = re.sub('<code>[^>]+</code>', '', row['Body'])
        soup = BeautifulSoup(body_only, "lxml")
        valid_words = [w for w in tokenizer.tokenize(soup.text)
                       if not_punctuation(w)]
        tag_removed_text = soup.text.replace("\n", "")
        if valid_words:
            score = textstat.flesch_reading_ease(tag_removed_text)
        else:
            score = 0  # textstat fails on word-free bodies
        print("flesch_reading_ease_score of", index, "-", score)
        final_scores.append(score)
    df['BodyFleschReadingEaseLevel'] = final_scores
    df.to_csv("combined.csv")
def flesch_score(text):
    """Flesch reading ease of *text*, or 0 for empty input or any scorer
    failure.

    Fix: the bare ``except:`` — which also swallowed KeyboardInterrupt
    and SystemExit — is narrowed to ``except Exception``.
    """
    try:
        if text == "":
            return 0
        return textstat.flesch_reading_ease(text)
    except Exception:
        return 0
def print_readability_metrics(text, file_name):
    """Print Flesch ease, Dale-Chall and SMOG scores for *file_name*'s
    text, followed by a separator line."""
    print(file_name, " readability metrics")
    labelled_scores = (
        ("flesch reading ease: ", textstat.flesch_reading_ease(text)),
        ("dale chall readability: ", textstat.dale_chall_readability_score(text)),
        ("smog index: ", textstat.smog_index(text)),
    )
    for label, value in labelled_scores:
        print(label, value)
    print('------------------------------------------------')
def get_text_features(article_contents: str) -> dict:
    """
    Takes an article's contents and analyzes its complexity using numerous
    reading scores and methods. Also calculates other factors such as the
    number of typos.

    @param article_contents, a string which contains the contents of an article
    @return language_analysis_dict, a dictionary which contains the scores

    Fixes: the word count is computed once instead of twice, and an
    article with zero countable words no longer raises
    ZeroDivisionError — the two ratio features fall back to 0.0.
    """
    tool = language_check.LanguageTool('en-US')
    word_count = textstat.lexicon_count(article_contents)
    typo_count = len(tool.check(article_contents))
    difficult_count = textstat.difficult_words(article_contents)
    language_analysis_dict = {
        "flesch_reading": textstat.flesch_reading_ease(article_contents),
        "flesch_kincaid": textstat.flesch_kincaid_grade(article_contents),
        "coleman_liau": textstat.coleman_liau_index(article_contents),
        "typos_to_words":
            typo_count / word_count if word_count else 0.0,
        "percent_difficult_words":
            difficult_count / word_count if word_count else 0.0,
    }
    return language_analysis_dict
def calculate_statistics(lyrics):
    """
    Calculates statistics based on the text_raw of the lyrics.
    :return: Annotated lyrics containing information about the songs
    """
    logging.info("Calculating Statistics")
    from textstat.textstat import textstat
    for idx, song in tqdm(enumerate(lyrics), total=len(lyrics)):
        try:
            song["num_syllables"] = textstat.syllable_count(song["text_raw"])
            song["num_words"] = textstat.lexicon_count(song["text_raw"])
            song["num_sentences"] = textstat.sentence_count(song["text_raw"])
            song["flesch_score"] = textstat.flesch_reading_ease(
                song["text_raw"])
            song["flesch_kincaid_level"] = textstat.flesch_kincaid_grade(
                song["text_raw"])
            song["fog_score"] = textstat.gunning_fog(song["text_raw"])
            # NOTE(review): this key is misleading — it stores the
            # Dale-Chall readability score, not a difficult-word count.
            # Renaming it would change the output schema consumers rely
            # on, so it is only flagged here; confirm with downstream code.
            song[
                "num_difficult_words"] = textstat.dale_chall_readability_score(
                    song["text_raw"])
        except Exception as e:
            # Best-effort: a malformed song is logged and skipped, the
            # rest of the batch is still annotated.
            logging.error(
                "Something bad happened in the current song ! Skipping it... \n{}"
                .format(song))
            logging.exception(e)
    return lyrics
def getTopicsDistributionWithinTheText(path, chunk_length=300):
    """Per-chunk LDA topic probabilities for the document at *path*.

    Only topics whose whole-document score exceeds the module-level
    ``min_score`` are tracked; each tracked topic gets one probability
    per chunk_length-word chunk (0 when absent from the chunk).

    :returns: (scores dict topic-id -> list of per-chunk probabilities,
        whole-document global_scores)
    """
    global_scores = topicsFromTokens(tokenize(loadText(path)))
    text = loadText(path)
    words = text.split()
    # NOTE(review): computed but never used in this function — confirm
    # before removing.
    average = textstat.flesch_reading_ease(text)
    scores = dict()
    # Select the topics to track, strongest first.
    for i in sorted(global_scores, key=lambda tup: tup[1], reverse=True):
        if i[1] > min_score:
            scores[i[0]] = []
    x = []
    y = []
    cur = 0
    i = 1  # chunks processed so far (1-based); reuses the loop name ``i``
    while (cur < len(words)):
        sub = words[cur:cur + chunk_length]
        sub.append('.')  # ensure the chunk ends in a sentence terminator
        sub_text = ' '.join(sub)
        cur += chunk_length
        bow = lda.id2word.doc2bow(raw_tokenize(sub_text))
        score = lda.get_document_topics(bow)
        # Record this chunk's probability for every tracked topic.
        for s in score:
            if s[0] in scores.keys():
                scores[s[0]].append(s[1])
        # Pad topics missing from this chunk with 0 to keep the lists
        # aligned with the chunk index.
        for s in scores:
            if len(scores[s]) < i:
                scores[s].append(0)
        i += 1
    return scores, global_scores
def analyse_plain_text(test_data):
    """Populate and return a TextStats for *test_data*: word/sentence
    counts, TextBlob sentiment, Flesch reading ease, words-per-sentence,
    and a Counter of unique stemmed non-stopword tokens.

    Fixes: the duplicate ``from textstat.textstat import textstat`` is
    removed, and the nltk ``stopwords`` corpus module is no longer
    shadowed by the set built from it.
    """
    text_stats = TextStats()

    # Simple sentiment / token counts via TextBlob.
    from textblob import TextBlob
    zen = TextBlob(test_data)
    text_stats.word_count = len(zen.words)
    text_stats.sentence_count = len(zen.sentences)
    text_stats.polarity = zen.sentiment.polarity
    text_stats.subjectivity = zen.sentiment.subjectivity

    # Easy to read, this?
    from textstat.textstat import textstat
    text_stats.flesch_reading_ease = textstat.flesch_reading_ease(test_data)

    # Words per sentence count.
    text_stats.word_per_sentence_count = (
        textstat.lexicon_count(test_data, False) /
        textstat.sentence_count(test_data))

    # Convert all to lower.
    test_data = test_data.lower()

    # Tokenise.
    from nltk.tokenize import word_tokenize
    words = word_tokenize(test_data)

    # Tokenise stemmed text.
    from nltk.stem import PorterStemmer
    ps = PorterStemmer()
    test_data_stemmed = ''
    for w in words:
        test_data_stemmed = test_data_stemmed + ' ' + ps.stem(w)
    stemmed_words = word_tokenize(test_data_stemmed)

    # Remove non-words.
    nonPunct = re.compile('.*[A-Za-z0-9].*')  # must contain a letter or digit
    filtered = [w for w in stemmed_words if nonPunct.match(w)]

    # Remove stopwords (plus a few corpus-specific extras).
    from nltk.corpus import stopwords
    stopword_set = set(stopwords.words('english'))
    extra_stopwords = set([
        'that', '\'s', 'wa', 'thi', 'like', 'n\'t', 'would', 'ha', 'us', 'get'
    ])
    filtered = [
        w for w in filtered
        if w not in stopword_set and w not in extra_stopwords
    ]

    # How many unique words?
    from collections import Counter
    counts = Counter(filtered)
    text_stats.unique_word_count = len(counts)

    # Words sorted by most common.
    text_stats.counts = counts

    return text_stats
def calculateScores(text):
    """Readability, subjectivity and sentiment for *text*, each scaled to
    roughly 0-100. The text is byte-coerced first so odd encodings don't
    break the scorers."""
    encoded = smart_bytes(text, encoding="utf-8", strings_only=False,
                          errors="replace")
    sentiment = TextBlob(encoded).sentiment
    return {
        "readability_score": textstat.flesch_reading_ease(encoded),
        "subjectivity_score": sentiment.subjectivity * 100,
        "sentiment_score": (sentiment.polarity + 1) * 50,
    }
def flesch_reading_ease(text):
    """
    :type text: Text
    :param text: The text to be analysed
    :rtype float
    :returns Flesch Reading Ease Score
    """
    raw = text.text  # unwrap the project's Text container
    return textstat.flesch_reading_ease(raw)
def get_sensitivity_rating(text):
    """Rate *text*: subjectivity (TextBlob, scaled from 0-1 to percent),
    Flesch readability, and their difference as the overall
    'sensitivity'."""
    sentimentality = TextBlob(text).subjectivity * 100
    readability = textstat.flesch_reading_ease(text)
    d = {
        "sentimentality": sentimentality,
        "readability": readability,
        # overall sensitivity = readability minus subjectivity
        "sensitivity": readability - sentimentality,
    }
    return d
def textConfidence(fname):
    """OCR the image *fname* with Tesseract and print its Flesch-Kincaid
    grade and Flesch reading ease, followed by the standard ease-score
    legend.

    Fix: the two Python-2-only ``print`` statements are replaced with
    ``print()`` calls, consistent with the rest of the function.
    """
    with PyTessBaseAPI() as api:
        api.SetImageFile(fname)
        text = api.GetUTF8Text()
        print(textstat.flesch_kincaid_grade(text))
        print(textstat.flesch_reading_ease(text))
        print("90-100 : Very Easy")
        print("80-89 : Easy")
        print("70-79 : Fairly Easy")
        print("60-69 : Standard")
        print("50-59 : Fairly Difficult")
        print("30-49 : Difficult")
        print("0-29 : Very Confusing")
def get_readme_flesch_reading_ease(self):
    """Calculates the Flesch Reading Ease level of the repository's
    readme, with 100 being easiest to read and 0 being hardest, using
    textstat. Returns None when the repository has no readme.
    """
    if not self.readme:
        return None
    return textstat.flesch_reading_ease(self.readme)
def get_reading_score(self):
    """Store the Flesch reading ease of the page copy on
    self.reading_score. Pages whose text is 10 characters or shorter get
    0, since textstat errors on near-empty input."""
    self.reading_score = (textstat.flesch_reading_ease(self.page_text)
                          if len(self.page_text) > 10 else 0)
def _get_readability_score(text):
    """Flesch reading ease of *text* mapped from roughly (-100, 100) onto
    (0, 100); 0 on any failure.

    Fixes: the bare ``except`` is narrowed to ``Exception``, the Python-2
    ``print`` statement becomes a ``print()`` call, and ``// 2`` keeps
    the original integer-division semantics on Python 3.
    """
    try:
        readability_score = int(textstat.flesch_reading_ease(text))
        # converts (-100, 100) to (0, 100)
        return 50 + readability_score // 2
    except Exception:
        print('_get_readability_score')
        return 0
def trygetreadingease(extract):
    """Flesch reading ease for *extract*, or None when textstat fails (a
    warning is logged). Zero-length extracts short-circuit to None to
    avoid spamming the log for the common empty case."""
    if not extract.strip():
        return None
    try:
        return textstat.flesch_reading_ease(extract)
    except Exception as e:
        logging.warning("Can't get FRES for %s: %s" % (extract, e))
        return None
def run_query(search_terms):
    """Query the HealthFinder API for *search_terms* and return a list of
    result dicts (title, link, truncated summary, Flesch score, sentiment
    scores, source). Returns [] on empty results; network errors are
    printed and yield whatever was collected (normally []).
    """
    # Construct the latter part of our request's URL.
    # Sets the format of the response to JSON and sets other properties.
    search_url = "http://healthfinder.gov/developer/Search.json?api_key=" + HEALTHFINDER_API_KEY + "&keyword=" + search_terms

    # Create our results list which we'll populate.
    results = []
    try:
        # Connect to the server and read the response generated.
        response = urllib2.urlopen(search_url).read()

        # Convert the string response to a Python dictionary object.
        json_response = json.loads(response)
        if 'Tools' not in json_response["Result"]:
            return []

        # Loop through each page returned, populating out results list.
        for result in json_response["Result"]["Tools"]:
            try:
                blob = TextBlob(result['Contents'])
                # NOTE(review): this loop leaves only the LAST sentence's
                # polarity/subjectivity in the variables used below —
                # presumably an average over sentences was intended;
                # confirm before changing.
                for sentence in blob.sentences:
                    polarity_score = sentence.sentiment.polarity
                    subjectivity_score = sentence.sentiment.subjectivity
                # MoreInfo may be a single object or a list; take the
                # first URL either way and strip a trailing slash.
                url = ""
                if type(result['MoreInfo']) == list:
                    url = result['MoreInfo'][0]['Url']
                else:
                    url = result['MoreInfo']['Url']
                if url.endswith('/'):
                    url = url[:-1]
                # Truncate long summaries to 400 chars with an ellipsis.
                adder = ""
                if len(result['Contents']) > 400:
                    adder = "..."
                results.append({
                    'title': result['Title'],
                    'link': url,
                    'summary': result['Contents'][:400] + adder,
                    'flesch_score': '{0:.2f}'.format(textstat.flesch_reading_ease(result['Contents'])),
                    'polarity_score': '{0:.2f}'.format(polarity_score),
                    'subjectivity_score': '{0:.2f}'.format(subjectivity_score),
                    'source': 'HealthFinder'})
            except:
                # Best-effort: skip any malformed result entry.
                continue

    # Catch a URLError exception - something went wrong when connecting!
    except urllib2.URLError as e:
        print "Error when querying the HealthFinder API: ", e

    # Return the list of results to the calling function.
    return results
def vecify(v):
    """Readability feature vector for text `v` (eight textstat metrics)."""
    metrics = (
        ts.flesch_reading_ease,
        # ts.smog_index,
        ts.flesch_kincaid_grade,
        ts.coleman_liau_index,
        ts.automated_readability_index,
        ts.dale_chall_readability_score,
        ts.difficult_words,
        ts.linsear_write_formula,
        ts.gunning_fog,
    )
    return [metric(v) for metric in metrics]
def calc_flesch_level(self):
    """Populate self.flesch_level with one Flesch reading-ease score per
    document, computed over that document's joined sentence tokens."""
    scores = []
    for sentences in self.sent_tok:
        scores.append(textstat.flesch_reading_ease(' '.join(sentences)))
    self.flesch_level = scores
def fleschScore(data):
    """Flesch reading-ease of `data`; 0.0 when textstat rejects the input.

    textstat raises TypeError for very short inputs (fewer than ~3
    words); the documents scored here are large, but guard anyway.
    """
    try:
        return textstat.flesch_reading_ease(data)
    except TypeError:
        return 0.0
def calculate_stats(content):
    """Return sentiment polarity/subjectivity and Flesch reading ease for
    `content`, each formatted to three decimals (as strings).

    On failure returns numeric zeros -- note the success path returns
    strings, so callers must tolerate both types.
    """
    try:
        testimonial = TextBlob(content)
        polarity = '%.3f' % (testimonial.sentiment.polarity)
        subjectivity = '%.3f' % (testimonial.sentiment.subjectivity)
        flesh_score = '%.3f' % (textstat.flesch_reading_ease(content))
        return {'polarity': polarity,
                'subjectivity': subjectivity,
                'flesh_score': flesh_score}
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        return {'polarity': 0, 'subjectivity': 0, 'flesh_score': 0}
def textstat_analysis(profile_text):
    """Compute eleven textstat readability metrics for `profile_text`.

    Returns a tuple: (flesch_reading_ease, smog, flesch_kincaid_grade,
    coleman_liau, automated_readability, dale_chall, difficult_words,
    linsear_write, gunning_fog, readability_consensus, lexicon_count).
    """
    return (
        textstat.flesch_reading_ease(profile_text),
        textstat.smog_index(profile_text),
        textstat.flesch_kincaid_grade(profile_text),
        textstat.coleman_liau_index(profile_text),
        textstat.automated_readability_index(profile_text),
        textstat.dale_chall_readability_score(profile_text),
        textstat.difficult_words(profile_text),
        textstat.linsear_write_formula(profile_text),
        textstat.gunning_fog(profile_text),
        textstat.readability_consensus(profile_text),
        textstat.lexicon_count(profile_text),
    )
def displayResults( path ): print "stats" text = loadText(path) raw_tokens = raw_tokenize(text) print "number of words %s" %count_words(text) print "number of sentences %s" %textstat.sentence_count(text) print "uniques words: %s" %len(set(raw_tokenize(text))) print "Difficulty %s / 100 " %(100 - textstat.flesch_reading_ease(text)) print "Average sentiment %s (negative: 0, neutral: 5, positive: 10)"%calculateSentiment(raw_tokens) print print "topic distribution" displayTopicsDistributionWithinTheText(path, 300, pie = False) print "difficulty over the text " complexityAlongtheText( path, 300)
def age_feature(text, feature_vect):
    """
    Extract age features
    :param text: raw text to featurize
    :param feature_vect: contains a bag of words
    :return: a dictionary which contains the features and their computed values
    """
    token_set = set(word_tokenize(text.lower()))
    features = {'contains(%s)' % word: word in token_set
                for word in feature_vect}
    features['FRE'] = textstat.flesch_reading_ease(text)
    features['FKGL'] = textstat.flesch_kincaid_grade(text)
    return features
def __load_text(self):
    # Load the document at local_data_dir/self.filename, compute
    # readability statistics, and cache sentences/tokens on the instance.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    with codecs.open('{}/{}'.format(local_data_dir, self.filename), 'r', encoding = 'utf8', errors = 'ignore') as f:
        data = f.read()
    self.flesch_reading_ease = textstat.flesch_reading_ease(data)
    self.flesch_kincaid_grade = textstat.flesch_kincaid_grade(data)
    sentences = tokenizer.tokenize(data)
    self.n_sentences = textstat.sentence_count(data)
    # Average words (textstat lexicon count) per textstat-counted sentence.
    self.avg_sentence_length = textstat.lexicon_count(data, True) * 1. / self.n_sentences
    # Mean character length of non-stopword words, splitting on single spaces.
    self.avg_word_length = np.mean([len(w) for s in sentences for w in s.split(' ') if w not in stopwords.words('english')])
    print 'Parse ', len(sentences), ' sentences, average sentence length ', self.avg_sentence_length, ', average word length ', self.avg_word_length
    self.sentences = sentences
    self.tokens = []
    # Flatten per-sentence token lists into self.tokens.
    [self.tokens.extend(text_tokenize(sentence)) for sentence in sentences]
def run_healthfinder_query(search_terms, read_min, read_max, pol_min, pol_max, sub_min, sub_max): root_url = 'http://healthfinder.gov/developer/' search_type = 'Search.json' query = urllib2.quote(search_terms) search_url = "{0}{1}?api_key={2}&keyword={3}".format( root_url, search_type, HEALTHFINDER_API_KEY, query ) results = [] try: response = urllib2.urlopen(search_url).read() json_response = json.loads(response) for result in json_response["Result"]["Topics"]: print "here" summary = result["Sections"][0]["Description"] blobSummary = TextBlob(summary) read = textstat.flesch_reading_ease(summary) pola = float("%.2f" % blobSummary.sentiment.polarity) subj = float("%.2f" % blobSummary.sentiment.subjectivity) if (read_min <= read <= read_max) and (pol_min <= pola <= pol_max) and (subj <= sub_max and subj >= sub_min): results.append({ 'title':result["Title"], 'url':result["AccessibleVersion"], 'summary':result["Sections"][0]["Description"], 'read':read, 'pola':pola, 'subj':subj, 'source':'HealthFinder' }) except urllib2.URLError as e: print "Error when querying the HealthFinder API: ", e return results
def run_medline_query(search_terms, read_min, read_max, pol_min, pol_max, sub_min, sub_max): root_url = 'https://wsearch.nlm.nih.gov/ws/query' source = 'healthTopics' query = urllib.quote(search_terms) query = query.replace('%2B','+') query = query.replace('%27','%22') search_url = "{0}?db={1}&term={2}&rettype=brief".format( root_url, source, query) results = [] try: response = urllib2.urlopen(search_url).read() response = xmltodict.parse(response) for result in response['nlmSearchResult']['list']['document']: summary = re.sub('\<.*?>','', result['content'][-1]['#text']) blobSummary = TextBlob(summary) read = textstat.flesch_reading_ease(summary) pola = float("%.2f" % blobSummary.sentiment.polarity) subj = float("%.2f" % blobSummary.sentiment.subjectivity) if (read_min <= read <= read_max) and (pol_min <= pola <= pol_max) and (subj <= sub_max and subj >= sub_min): results.append({ 'title':re.sub('\<.*?\>','', result['content'][0]['#text']), 'url':result['@url'], 'summary':re.sub('\<.*?\>','', result['content'][-1]['#text']), 'read':read, 'pola':pola, 'subj':subj, 'source':'MedLine' }) except urllib2.URLError as e: print "Error when querying the MedLine API: ", e return results
def main():
    """For each file named on the command line, write a companion
    '<name>.readability.snip' file listing textstat metrics, one per line."""
    metric_names = [
        'syllable_count',
        'lexicon_count',
        'sentence_count',
        'difficult_words',
        'flesch_reading_ease',
        'flesch_kincaid_grade',
        'smog_index',
        'automated_readability_index',
        'coleman_liau_index',
        'linsear_write_formula',
        'dale_chall_readability_score',
    ]
    for arg in sys.argv[1:]:
        with open(arg) as src:
            text = src.read()
        with open(arg + '.readability.snip', 'w') as out:
            for name in metric_names:
                out.write("%s : %s\n" % (name, getattr(textstat, name)(text)))
def getReadingLevel(subreddit):
    # Sample up to 1000 random, non-deleted, positively-scored comments
    # from `subreddit` via BigQuery, average their Flesch reading ease,
    # and store [subreddit, difficulty] into the global `results` list
    # at the subreddit's index in the global `subreddits`.
    # NOTE(review): `year` and `subreddit` are spliced into the SQL by
    # string concatenation -- safe only if both values are trusted.
    query = '''SELECT body FROM (SELECT body, RAND() AS r1 FROM [fh-bigquery:reddit_comments.''' + str(year) + '''] WHERE subreddit == "''' + subreddit + '''" AND body != "[deleted]" AND body != "[removed]" AND score > 1 ORDER BY r1 LIMIT 1000) '''
    bigquery_service = build('bigquery', 'v2', credentials=credentials)
    try:
        query_request = bigquery_service.jobs()
        query_data = {
            'query': query,
            'timeoutMs': 20000
        }
        query_response = query_request.query(
            projectId=bigquery_pid,
            body=query_data).execute()
    except HttpError as err:
        print('Error: {}'.format(err.content))
        raise err
    # NOTE(review): raises KeyError when the response carries no 'rows'
    # (e.g. an empty result set) -- confirm upstream guarantees.
    rows = query_response['rows']
    levels_sum = 0.0
    levels_count = 0
    for i in range(len(rows)):
        text = rows[i]['f'][0]['v']
        # Strip URLs, then collapse repeated whitespace.
        text = re.sub('([A-Za-z]+:\/\/[A-Za-z0-9]+\.[A-Za-z0-9]+[^\s-]*)|([A-Za-z]+\.[A-Za-z0-9]+\.[A-Za-z0-9]+[^\s-]*)', '', text) #url get rid
        text = re.sub('\s\s+', ' ', text)
        # Only score comments textstat recognises as having sentences.
        if textstat.sentence_count(text) > 0:
            levels_sum += textstat.flesch_reading_ease(text)
            levels_count += 1
    average_level = 0.0
    if levels_count > 0:
        average_level = levels_sum / levels_count
    # Difficulty = 100 - average reading ease (higher means harder).
    results[subreddits.index(subreddit)] = [subreddit, 100.0 - average_level]
def main():
    # For every known reddit user, concatenate all of their comments,
    # score the combined text with Flesch reading ease, and store one
    # {username, reading_level} document per user in MongoDB.
    mongo_client = MongoClient('mongodb://*****:*****@' '107.170.215.176:27017')
    reddit_data = mongo_client.reddit_data
    user_data = reddit_data.user_data
    user_reading_level = reddit_data.user_reading_level
    user_comments = reddit_data.user_comments
    # Unique index on username so re-runs cannot create duplicate docs.
    user_reading_level.create_index(
        [("username", pymongo.ASCENDING)],
        background=True,
        unique=True,
        dropDups=True
    )
    # no_cursor_timeout: the full scan can outlive the default cursor TTL.
    for user in user_data.find(no_cursor_timeout=True).sort('data.name', 1):
        name = user['data']['name']
        print name
        comment_list = []
        for comment in user_comments.find({'data.author': name}):
            if comment['kind'] == 't1':
                # Actually a comment
                comment_text = comment['data']['body']
                comment_list.append(comment_text)
        comment_book = ' '.join(comment_list).strip()
        try:
            if len(comment_book) > 0:
                reading_ease = textstat.flesch_reading_ease(comment_book)
            else:
                reading_ease = 0
        except TypeError:
            # I hate textstat
            reading_ease = 0
        reading_level_data = {'username': name, 'reading_level': reading_ease}
        try:
            user_reading_level.insert_one(reading_level_data)
        except pymongo.errors.DuplicateKeyError:
            # Already scored on a previous run; skip.
            continue
def healthapif(search):
    """Search the HealthFinder API for `search` (a sequence of keyword
    strings) and return {url: [title, readability_text, sentiment_text]}.

    Topics missing a Title or AccessibleVersion tag are skipped.
    """
    healthDict = {}
    health_api = "http://healthfinder.gov/developer/Search.xml?api_key=gnviveyezcuamzei&keyword="
    keywords = " ".join(search)
    search = keywords.replace('"', "%22")
    search = search.replace(' ', "%20")
    health_api += search
    j = requests.get(health_api).content
    root = ET.fromstring(j)
    # get data from Url tags only
    for topics in root.findall('Topics'):
        for topic in topics.findall('Topic'):
            try:
                title = topic.find('Title').text
                url = topic.find('AccessibleVersion').text
            except AttributeError:
                # BUG FIX: a topic without Title/AccessibleVersion used to
                # fall through and reuse the previous iteration's `url`
                # (NameError on the very first topic); skip it instead.
                continue
            healthDict[url] = [title]
            content = findContentHealth(url)
            if content != "empty":
                # Reading ease score
                score = textstat.flesch_reading_ease(content)
                scoreText = readingEaseScore(score)
                healthDict[url].append(str(score) + " (" + scoreText + ")")
            else:
                healthDict[url].append("-")
            # Sentiment score
            blob = TextBlob(content)
            sentimentPolarity = blob.sentiment.polarity
            sentimentSubjectivity = blob.sentiment.subjectivity
            sentimentScore = "polarity= %.3f (%s), subjectivity= %.3f (%s)" % (sentimentPolarity, polarityScore(sentimentPolarity), sentimentSubjectivity, subjectivityScore(sentimentSubjectivity))
            healthDict[url].append(sentimentScore)
    return healthDict
def calculate_readability_measures(id):
    """Fetch page `id` from Elasticsearch, compute readability measures
    for its content, and write them back to the document.

    Best-effort: failures are logged and swallowed so batch callers
    keep going.
    """
    es = elasticsearch.Elasticsearch()
    source = es.get_source(index='beek', doc_type='page', id=id)
    # Hoist the repeated field lookup out of the measure dict.
    content = source['content']
    try:
        measures = {
            'flesch': textstat.flesch_reading_ease(content),
            'smog': textstat.smog_index(content),
            'flesch_kincaid': textstat.flesch_kincaid_grade(content),
            'coleman_liau': textstat.coleman_liau_index(content),
            'readability': textstat.automated_readability_index(content),
            'dale_chall': textstat.dale_chall_readability_score(content),
            'difficult_words': textstat.difficult_words(content),
            'linsear_write_formula': textstat.linsear_write_formula(content),
            'gunning_fog': textstat.gunning_fog(content),
            'consensus': textstat.readability_consensus(content),
        }
        es.update(index='beek', doc_type='page', id=id,
                  body={'doc': {'measures': measures}}, refresh=True)
    except Exception as err:
        # Was a silent `pass`; surface the failure without aborting.
        logging.warning("readability measures failed for %s: %s", id, err)
def analyze(text):
    """Return the Flesch reading ease of `text`, clamped to [0.0, 100.0].

    Returns -1.0 for whitespace-only input, URLs, or anything textstat
    cannot score (non-float or negative results included).
    """
    # Automatically reject if no input
    if text.isspace():
        return -1.0
    if text.startswith('http'):
        return -1.0
    # Analyze text
    try:
        x = textstat.flesch_reading_ease(text)
    except Exception:
        # Narrowed from a bare except (which also caught SystemExit etc.).
        return -1.0
    # Keep outputs valid
    if not isinstance(x, float):
        return -1.0
    if x < 0:
        return -1.0
    if x > 100:
        return 100.0
    return x
def fic2text(ident):
    """Load fic `ident`, returning (tags, text) where text is the cleaned,
    ASCII-only concatenation of its segments.

    Also prints (ident, reading_ease, reading_level) as a side effect.
    """
    textsegs = Loader.get_field(data['fics'], ident, 'fic')
    rtags = Loader.get_field(data['base'], ident, 'tags')
    rtext = ""
    for line in textsegs:
        line = line.replace(u'\xa0', ' ')
        # BUG FIX: these two substitutions were previously computed into a
        # dead variable `s` and never applied to `line`; chain them.
        line = re.sub('([.,!?()])', r' \1 ', line)  # pad punctuation
        line = re.sub('\s{2,}', ' ', line)          # collapse whitespace runs
        line = line.encode('ascii', 'ignore').decode('ascii')
        rtext += line + " "
    tags = []
    for genre in rtags:
        for el in rtags[genre]:
            tname = el["name"]
            tags.append(tname)
    reading_ease = textstat.flesch_reading_ease(rtext)
    reading_level = textstat.flesch_kincaid_grade(rtext)
    print(ident, reading_ease, reading_level)
    return tags, rtext
def extract_features_sub(text, dialogue=True):
    """Compute language-complexity, lexical-diversity, sentiment and
    General Inquirer features for `text`.

    Keys are suffixed '_dialogue' or '_action' depending on `dialogue`.
    """
    if len(text) > 0:
        try:
            language_complexity = {
                'flesch_reading_ease': textstat.flesch_reading_ease(text),
                'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
                'automated_readability_index': textstat.automated_readability_index(text)}
        except Exception:
            # textstat can fail on degenerate text; keep going with
            # sentinels (None here vs 0 for empty text is preserved so
            # callers can distinguish the two cases).
            language_complexity = {
                'flesch_reading_ease': None,
                'flesch_kincaid_grade': None,
                'automated_readability_index': None}
    else:
        language_complexity = {
            'flesch_reading_ease': 0,
            'flesch_kincaid_grade': 0,
            'automated_readability_index': 0}
    lexical_diversity = find_lex_d(text)
    sentiment = extract_senti_wordnet(text)
    inquirer_features = general_inquirer_features(text)
    final_features = {}
    final_features.update(language_complexity)
    final_features.update(lexical_diversity)
    final_features.update(sentiment)
    final_features.update(inquirer_features)
    # Rename every key with the section suffix via the convert() helper.
    curr_keys = [feature for feature in final_features]
    suffix = "_dialogue" if dialogue else "_action"
    new_keys = [feature + suffix for feature in final_features]
    return convert(final_features, dict(zip(curr_keys, new_keys)))