def check_reading_level(input_string):
    """Map the Flesch-Kincaid grade of *input_string* to a school-level label.

    :param input_string: text to score
    :return: one of "5th grade" ... "12th grade", "college",
             or "college graduate"
    """
    # Compute the (relatively expensive) grade once; the original re-ran
    # textstat.flesch_kincaid_grade in every elif branch (up to 9 calls).
    score = textstat.flesch_kincaid_grade(input_string)
    # Upper-bound thresholds paired with labels, easiest first —
    # same cutoffs as the original if/elif ladder.
    bands = [
        (5, "5th grade"), (6, "6th grade"), (7, "7th grade"),
        (8, "8th grade"), (9, "9th grade"), (10, "10th grade"),
        (11, "11th grade"), (12, "12th grade"), (13, "college"),
    ]
    for limit, label in bands:
        if score <= limit:
            return label
    return "college graduate"
def other_features(tweet):
    """Build a flat feature vector for one tweet.

    Features (in order): VADER compound sentiment, Flesch reading ease,
    Flesch-Kincaid grade, Gunning fog, character length, token count,
    hashtag count, mention count, URL count, retweet flag (0/1).

    :param tweet: raw tweet text
    :return: list of 10 numeric features
    """
    ##SENTIMENT
    sentiment = VS(tweet)
    ##READABILITY
    #See https://pypi.python.org/pypi/textstat/
    flesch = round(textstat.flesch_reading_ease(tweet), 3)
    flesch_kincaid = round(textstat.flesch_kincaid_grade(tweet), 3)
    gunning_fog = round(textstat.gunning_fog(tweet), 3)
    ##TEXT-BASED
    length = len(tweet)
    num_terms = len(tweet.split())
    ##TWITTER SPECIFIC TEXT FEATURES
    hashtag_count = tweet.count("#")
    mention_count = tweet.count("@")
    url_count = tweet.count("http")
    retweet = 0
    # FIX: `startswith(...) is True` replaced — startswith already returns a bool.
    if tweet.lower().startswith("rt"):
        retweet = 1
    # Also flag retweets where "rt"/"#rt" appears anywhere as a token.
    words = tweet.lower().split()
    if "rt" in words or "#rt" in words:
        retweet = 1
    features = [sentiment['compound'], flesch, flesch_kincaid, gunning_fog,
                length, num_terms, hashtag_count, mention_count, url_count,
                retweet]
    return features
def do_text_stats(self, text):
    """Compute a bundle of textstat readability metrics for *text*.

    Metrics that textstat raises TypeError on (degenerate/too-short input)
    are reported as None, exactly as in the original; the remaining metrics
    propagate any exception, also as in the original.

    Flesch reading-ease interpretation:
      90-100 Very Easy | 80-89 Easy | 70-79 Fairly Easy | 60-69 Standard
      50-59 Fairly Difficult | 30-49 Difficult | 0-29 Very Confusing

    :param text: text to analyse
    :return: dict of metric name -> value (or None on TypeError)
    """
    def safe(metric):
        # Factored-out guard: the original repeated this try/except
        # TypeError block four times.
        try:
            return metric(text)
        except TypeError:
            return None

    return {
        "syllable_count": textstat.syllable_count(text),
        "lexicon_count": textstat.lexicon_count(text, True),
        "sentence_count": textstat.sentence_count(text),
        "flesch_reading_ease": safe(textstat.flesch_reading_ease),
        "flesch_kincaid_grade": safe(textstat.flesch_kincaid_grade),
        "gunning_fog": textstat.gunning_fog(text),
        "smog_index": textstat.smog_index(text),
        "automated_readability_index":
            textstat.automated_readability_index(text),
        "coleman_liau_index": safe(textstat.coleman_liau_index),
        "linsear_write_formula": textstat.linsear_write_formula(text),
        "dale_chall_readability_score":
            textstat.dale_chall_readability_score(text),
        # Readability consensus based upon all the above tests.
        "text_standard": safe(textstat.text_standard),
    }
def analysis(self):
    """Run the full per-document analysis pipeline and store the results.

    Populates on ``self.document``: issue matches, word counts, word
    frequency, TextBlob noun phrases and sentiment, RAKE keywords, NLTK
    sentiment, and a Flesch-Kincaid readability grade; then folds the
    document into the issue aggregate ``summary``.
    """
    # Find political issues in document
    existing_issues, trigger_terms = self._find_issues()
    self.document.issues = json.dumps(existing_issues)
    # Get text frequency words and basic document stats
    content_words = get_lowercase(get_words(self.document.content))
    self.document.word_count = len(content_words)
    self.document.unique_word_count = len(set(content_words))
    content_words_without_stopwords = remove_stop_words(content_words)
    # top-10 most frequent non-stopword tokens, serialized as JSON
    self.document.word_frequency = json.dumps(nltk.FreqDist(content_words_without_stopwords).most_common(10))
    # Run Textblob's document analysis to get sentiment and noun phrases
    analysis = TextBlob(self.document.content)
    self.document.blob_keywords = json.dumps(analysis.noun_phrases)
    self.document.blob_sentiment = str(analysis.sentiment)
    # 3: RAKE keywords for each page (top 5 only)
    rake_results = self.rake.run(self.document.content)
    self.document.rake_keywords = json.dumps(rake_results[:5])
    # 4: Sentiment
    self.document.nltk_sentiment = self._check_sentiment()
    # 5: Readability — only scored when the content is non-blank
    if self.document.content.strip():
        self.document.readability = textstat.flesch_kincaid_grade(self.document.content)
    # Update Summary Object with Issue Information.
    # NOTE(review): `summary` is not defined in this method or visible in
    # this file chunk — presumably a module-level aggregate; confirm.
    # NOTE(review): if content was blank, `self.document.readability` may be
    # unset when read below — confirm callers guarantee non-blank content.
    for issue in existing_issues:
        key = issue.get('key')
        if key not in summary['issues']:
            summary['issues'][key] = {'examples': [], 'content': ''}
        summary['issues'][key]['content'] += (' ' + self.document.content)
        summary['issues'][key]['examples'].append({
            'title': self.document.title,
            'word_count': self.document.word_count,
            'unique_word_count': self.document.unique_word_count,
            'sentiment': self.document.nltk_sentiment,
            'readability': self.document.readability
        })
def get_text_features(article_contents: str) -> dict:
    """
    Takes an article's contents and analyzes its complexity using numerous
    reading scores and methods. Also calculates other factors such as the
    number of typos.

    @param article_contents, a string which contains the contents of an article
    @return language_analysis_dict, a dictionary which contains the scores
    @raise ValueError if the article contains no words — the original
           instead crashed with an unexplained ZeroDivisionError
    """
    tool = language_check.LanguageTool('en-US')
    # Hoisted: the original called lexicon_count twice for the two ratios.
    word_count = textstat.lexicon_count(article_contents)
    if word_count == 0:
        raise ValueError("article_contents contains no words")
    language_analysis_dict = {
        "flesch_reading": textstat.flesch_reading_ease(article_contents),
        "flesch_kincaid": textstat.flesch_kincaid_grade(article_contents),
        "coleman_liau": textstat.coleman_liau_index(article_contents),
        # grammar/spelling issues per word
        "typos_to_words": len(tool.check(article_contents)) / word_count,
        "percent_difficult_words":
            textstat.difficult_words(article_contents) / word_count,
    }
    return language_analysis_dict
def _get_reading_stats(no_code_text):
    """
    Returns reading level information
    :param no_code_text: String to analyse
    :return: list of TextFeature details
    """
    group_by = 'Reading Level Analysis '
    results = []

    def add(name, metric, catch=(), fallback="Undetermined"):
        # Factored-out pattern: the original repeated this try/append
        # block nine times.  `catch=()` means "let exceptions propagate",
        # matching the metrics the original did not guard.
        try:
            value = metric(no_code_text)
        except catch:
            # textstat sporadically raises IndexError on some inputs
            value = fallback
        results.append(TextFeature(name, value, group_by))

    add('Flesch Reading Ease', textstat.flesch_reading_ease)  # higher is better, scale 0 to 100
    add('Flesch-Kincaid Grade Level', textstat.flesch_kincaid_grade)
    add('The Fog Scale (Gunning FOG formula)', textstat.gunning_fog, (IndexError,))
    add('The SMOG Index', textstat.smog_index, (IndexError,))
    add('Automated Readability Index', textstat.automated_readability_index)
    add('The Coleman-Liau Index', textstat.coleman_liau_index)
    add('Linsear Write Formula', textstat.linsear_write_formula, (IndexError,))
    add('Dale Chall Readability Score', textstat.dale_chall_readability_score, (IndexError,))
    add('Readability Consensus', textstat.readability_consensus,
        (TypeError, IndexError),
        "Undetermined; One of the tests above failed.")
    return results
def get_special_metrics(text):
    """Collect surface statistics, readability scores, and TextBlob
    sentiment for *text*, grouped into three sub-dicts."""
    blob = TextBlob(text)
    stats = {
        "syllables": textstat.syllable_count(text),
        "words": textstat.lexicon_count(text),
        "characters": textstat.char_count(text),
        "polysyllables": textstat.polysyllabcount(text),
        "average letter per word": textstat.avg_letter_per_word(text),
        "average sentence length": textstat.avg_sentence_length(text),
        "average sentence per word": textstat.avg_sentence_per_word(text),
        "sentences": textstat.sentence_count(text),
    }
    difficulty = {
        "flesch reading ease": textstat.flesch_reading_ease(text),
        "smog index": textstat.smog_index(text),
        "flesch kincaid grade": textstat.flesch_kincaid_grade(text),
        "coleman liau index": textstat.coleman_liau_index(text),
        "gunning fog": textstat.gunning_fog(text),
    }
    sentiment = blob.sentiment
    return {
        "statistics": stats,
        "difficulty": difficulty,
        "sentiments": {
            "polarity": sentiment.polarity,
            "subjectivity": sentiment.subjectivity,
        },
    }
def readability(text): print("Readability\n=================================\n\n") print("Flesch Reading Ease\n________________________\n\n") print str(textstat.flesch_reading_ease(text)) + "\n" print("Smog Index\n________________________\n\n") print str(textstat.smog_index(text)) + "\n" print("Flesch Kincaid Grade\n________________________\n\n") print str(textstat.flesch_kincaid_grade(text)) + "\n" print("Coleman Liau Index\n________________________\n\n") print str(textstat.coleman_liau_index(text)) + "\n" print("ARI\n________________________\n\n") print str(textstat.automated_readability_index(text)) + "\n" print("Dale Chall\n________________________\n\n") print str(textstat.dale_chall_readability_score(text)) + "\n" print("Difficult Words\n________________________\n\n") print str(textstat.difficult_words(text)) + "\n" print("Linsear Write Formula\n________________________\n\n") print str(textstat.linsear_write_formula(text)) + "\n" print("Gunning Fog\n________________________\n\n") print str(textstat.gunning_fog(text)) + "\n" print "Compiled Score\n_____________________________\n\n" print str(textstat.text_standard(text)) + "\n" return len(adjectives)
def calculate_statistics(lyrics):
    """
    Calculates statistics based on the text_raw of the lyrics.

    :param lyrics: list of song dicts, each carrying a "text_raw" entry
    :return: the same list, annotated in place with readability statistics
    """
    logging.info("Calculating Statistics")
    from textstat.textstat import textstat
    for idx, song in tqdm(enumerate(lyrics), total=len(lyrics)):
        try:
            song["num_syllables"] = textstat.syllable_count(song["text_raw"])
            song["num_words"] = textstat.lexicon_count(song["text_raw"])
            song["num_sentences"] = textstat.sentence_count(song["text_raw"])
            song["flesch_score"] = textstat.flesch_reading_ease(
                song["text_raw"])
            song["flesch_kincaid_level"] = textstat.flesch_kincaid_grade(
                song["text_raw"])
            song["fog_score"] = textstat.gunning_fog(song["text_raw"])
            # BUG FIX: the original stored dale_chall_readability_score (a
            # grade-level score) under "num_difficult_words"; the key
            # clearly calls for the difficult-word count.
            song["num_difficult_words"] = textstat.difficult_words(
                song["text_raw"])
        except Exception as e:
            # best-effort: log and skip songs whose text breaks textstat
            logging.error(
                "Something bad happened in the current song ! Skipping it... \n{}"
                .format(song))
            logging.exception(e)
    return lyrics
def readability(text, file):
    """Write Gunning fog, Flesch reading ease, and Flesch-Kincaid grade of
    *text* to the open file object *file*.

    :param text: text to score
    :param file: writable file-like object
    """
    fog = textstat.gunning_fog(text)
    fres = textstat.flesch_reading_ease(text)
    fkgl = textstat.flesch_kincaid_grade(text)
    # FIX: the original used %d, which silently truncated these float
    # scores to integers; %.2f preserves the fractional part.
    file.write(
        '\nGunning Fog Index: %.2f \nFlesch Reading Ease: %.2f \nFlesch-Kincaid Grade: %.2f'
        % (fog, fres, fkgl))
def decide_if_assigned(self, person):
    """Assign the reading-level badge when the person's mean Flesch-Kincaid
    grade (over each product's title + abstract) is at most 14.

    Side effects: sets self.candidate_badge.value and self.assigned.
    """
    # grade level per product, keyed by DOI
    reading_levels = {}
    for my_product in person.all_products:
        text = ""
        if my_product.title:
            text += u" " + my_product.title
        # NOTE(review): get_abstract_using_mendeley() is called twice per
        # product (test + append); assumed cheap or memoized — confirm.
        if my_product.get_abstract_using_mendeley():
            text += u" " + my_product.get_abstract_using_mendeley()
        # only do if at least three words between periods,
        # otherwise textstat library prints too many Not Enough Words error messages
        if text:
            sentences = text.split(".")
            if any([len(sentence.split()) > 3 for sentence in sentences]):
                try:
                    grade_level = textstat.flesch_kincaid_grade(text)
                    # grade is sometimes negative, strangely; ignore those
                    if grade_level > 0:
                        reading_levels[my_product.doi] = grade_level
                except TypeError:
                    # textstat raises TypeError when text is too short
                    pass
    if reading_levels.values():
        average_reading_level = sum(reading_levels.values()) / float(
            len(reading_levels))
        # badge threshold: average at or below grade 14
        if average_reading_level <= 14:
            self.candidate_badge.value = average_reading_level
            self.assigned = True
def main():
    """Read a CSV of (impact, URL, text) rows from argv[1] and write a CSV
    of readability statistics per document to argv[2].
    """
    doc_id = 1
    # `with` guarantees both files are closed even on error — the original
    # leaked argv[2] on any exception.  newline='' is required by the csv
    # module to avoid blank-line artifacts on Windows.
    with open(sys.argv[2], 'w', encoding="utf8", newline='') as out_handle, \
         open(sys.argv[1], 'r', encoding="utf8", errors='ignore',
              newline='') as in_handle:
        writer = csv.writer(out_handle, delimiter=',')
        writer.writerow(["ID", "URL", "text", "impact-score", "readability",
                         "grade-level", "smog-index", "total-words",
                         "total-sentences"])
        reader = csv.reader(in_handle)
        # Skip the first line with headers
        next(reader)
        for row in reader:
            impact = str(row[0])
            url = str(row[1])
            text = str(row[2])
            read_ease = textstat.flesch_reading_ease(text)
            grade = textstat.flesch_kincaid_grade(text)
            smog = textstat.smog_index(text)
            words = textstat.lexicon_count(text)
            sentences = textstat.sentence_count(text)
            # Uncomment this if we want summary and key words
            # summary = summarize(text, ratio=0.3)
            # key_words = keywords(text, ratio=0.3)
            writer.writerow([doc_id, url, text, impact, read_ease, grade,
                             smog, words, sentences])
            doc_id = doc_id + 1
    print('Summary statistics complete!')
def compareContents():
    """Flask view: on POST, score the submitted poem text with five
    readability metrics, stash everything in the session, and render the
    comparison page with the scores; on GET, render the bare page."""
    if request.method != "POST":
        return render_template('compareContents.html')
    line = request.form['poem']
    # read the second form field as the original did (unused afterwards)
    poem1 = request.form['poem1']
    #---------Metrics comparison logic goes here. keep them in session attributes-----------------------#
    session['line'] = line
    scores = {
        'fre': textstat.flesch_reading_ease(line),
        'smog': textstat.smog_index(line),
        'fkg': textstat.flesch_kincaid_grade(line),
        'dcr': textstat.dale_chall_readability_score(line),
        'gf': textstat.gunning_fog(line),
    }
    for key, value in scores.items():
        session[key] = value
    return render_template('compareContents.html', metrics=True, line=line,
                           **scores)
def getReadabilityStats(text):
    """Return the Flesch-Kincaid grade of *text* as a one-entry dict."""
    return {'fleschGrade': textstat.flesch_kincaid_grade(text)}
def flesch_grade_score():
    """Recompute the Flesch-Kincaid grade for each row's Body (with code
    blocks and HTML stripped) in the module-level DataFrame ``df``, store
    it as BodyFleschKinkaidGradeLevel, and persist to combined.csv.

    Python 2 snippet (print statements); relies on module-level ``df``,
    ``not_punctuation``, ``re``, ``RegexpTokenizer`` and ``BeautifulSoup``.
    """
    # drop any stale column so the recomputed values replace it cleanly
    df.drop(['BodyFleschKinkaidGradeLevel'], inplace=True, axis=1,
            errors='ignore')
    print df.shape, "dropped a m**********r"
    tokenizer = RegexpTokenizer(r'\w+')
    final_flesch_kincaid_grade_score = []
    for index, row in df.iterrows():
        valid_words = []
        # remove <code>...</code> blocks before stripping remaining HTML
        body_only = re.sub('<code>[^>]+</code>', '', row['Body'])
        soup = BeautifulSoup(body_only, "lxml")
        word_tokens = tokenizer.tokenize(soup.text)
        # keep only tokens that pass the project's punctuation filter
        for word in word_tokens:
            if not_punctuation(word):
                valid_words.append(word)
        word_count = len(valid_words)
        print "word_count of ", index, " - ", word_count
        tag_removed_text = soup.text
        tag_removed_text = tag_removed_text.replace("\n", "")
        # bodies with no valid words score 0 instead of calling textstat
        if word_count != 0:
            flesch_kincaid_grade_score = textstat.flesch_kincaid_grade(
                tag_removed_text)
        else:
            flesch_kincaid_grade_score = 0
        print "flesch_grade_score of ", index, " - ", flesch_kincaid_grade_score
        final_flesch_kincaid_grade_score.append(flesch_kincaid_grade_score)
    df['BodyFleschKinkaidGradeLevel'] = final_flesch_kincaid_grade_score
    df.to_csv("combined.csv")
def validate_readability_english(d):
    """Return 1 when *d* reads at or below 8th grade on the Flesch-Kincaid
    scale, else 0."""
    grade = textstat.flesch_kincaid_grade(d)
    return 1 if grade <= 8 else 0
def f():
    """Score each line of abc.txt with four readability metrics and write
    the results to Readability_Scores.xls (one row per input line).
    """
    print("hello")
    book = xlwt.Workbook()
    worksheet = book.add_sheet('ReadabilityScore')
    # header row
    headers = ("Gen_sent", "flesch_reading_ease", "flesch_kincaid_grade",
               "dale_chall_readability_score", "gunning_fog")
    for col, header in enumerate(headers):
        worksheet.write(0, col, header)
    # FIX: `with` closes the handle (the original leaked it) and the new
    # name no longer shadows this function's own name `f`.
    with open('abc.txt') as source:  # , encoding='utf-8')
        for row, line in enumerate(source, start=1):
            worksheet.write(row, 0, line)
            fre = textstat.flesch_reading_ease(line)
            worksheet.write(row, 1, fre)
            # computed but never written, as in the original
            smog = textstat.smog_index(line)
            fkg = textstat.flesch_kincaid_grade(line)
            worksheet.write(row, 2, fkg)
            dcr = textstat.dale_chall_readability_score(line)
            worksheet.write(row, 3, dcr)
            gf = textstat.gunning_fog(line)
            worksheet.write(row, 4, gf)
    book.save('Readability_Scores.xls')
def other_features(tweet):
    """Extract sentiment, readability, and Twitter-surface features from a
    tweet.

    :param tweet: raw tweet text
    :return: [VADER compound, Flesch ease, FK grade, Gunning fog,
              char length, token count, '#' count, '@' count,
              'http' count, retweet flag]
    """
    ##SENTIMENT
    sentiment = VS(tweet)
    ##READABILITY
    #See https://pypi.python.org/pypi/textstat/
    flesch = round(textstat.flesch_reading_ease(tweet), 3)
    flesch_kincaid = round(textstat.flesch_kincaid_grade(tweet), 3)
    gunning_fog = round(textstat.gunning_fog(tweet), 3)
    ##TEXT-BASED
    length = len(tweet)
    num_terms = len(tweet.split())
    ##TWITTER SPECIFIC TEXT FEATURES
    hashtag_count = tweet.count("#")
    mention_count = tweet.count("@")
    url_count = tweet.count("http")
    retweet = 0
    # FIX: dropped the `is True` comparison — startswith returns a bool.
    if tweet.lower().startswith("rt"):
        retweet = 1
    # Checking if RT appears anywhere in the tweet as a token
    words = tweet.lower().split()
    if "rt" in words or "#rt" in words:
        retweet = 1
    features = [
        sentiment['compound'], flesch, flesch_kincaid, gunning_fog, length,
        num_terms, hashtag_count, mention_count, url_count, retweet
    ]
    return features
def main():
    """ Evaluate and print Readability scores for the file named in
    argv[1]; exit with an error message when no file is given. """
    if len(sys.argv) <= 1:
        sys.stderr.write('Error: specify input file.\n')
        sys.exit()
    # FIX: context manager closes the handle even if read() raises;
    # the original open/close pair leaked on error.
    with open(sys.argv[1], 'r') as inf:
        text = inf.read()
    lexcount = textstat.lexicon_count(text)
    sys.stdout.write('Lexicon count: {0:d}\n'.format(lexcount))
    # reading time in minutes
    # assumes 180 WPM plus some offset
    tread = (lexcount + 250) / 180.
    sys.stdout.write(
        'Estimating reading time: {0:1.1f} minutes.\n'.format(tread))
    ease = textstat.flesch_reading_ease(text)
    grade = textstat.flesch_kincaid_grade(text)
    sys.stdout.write('Flesch reading ease score: {0:1.1f}\n'.format(ease))
    sys.stdout.write('Flesch-Kincaid grade: {0:1.1f}\n'.format(grade))
def get_special_metrics(text):
    """Surface statistics, readability scores, and TextBlob sentiment for
    *text*, grouped into 'statistics', 'difficulty', and 'sentiments'."""
    blob = TextBlob(text)
    stat_fns = {
        'syllables': textstat.syllable_count,
        'words': textstat.lexicon_count,
        'characters': textstat.char_count,
        'polysyllables': textstat.polysyllabcount,
        'average letter per word': textstat.avg_letter_per_word,
        'average sentence length': textstat.avg_sentence_length,
        'average sentence per word': textstat.avg_sentence_per_word,
        'sentences': textstat.sentence_count,
    }
    difficulty_fns = {
        'flesch reading ease': textstat.flesch_reading_ease,
        'smog index': textstat.smog_index,
        'flesch kincaid grade': textstat.flesch_kincaid_grade,
        'coleman liau index': textstat.coleman_liau_index,
        'gunning fog': textstat.gunning_fog,
    }
    return {
        'statistics': {name: fn(text) for name, fn in stat_fns.items()},
        'difficulty': {name: fn(text) for name, fn in difficulty_fns.items()},
        'sentiments': {
            'polarity': blob.sentiment.polarity,
            'subjectivity': blob.sentiment.subjectivity
        }
    }
def get_readability(df2):
    """Return a copy of *df2* with ten textstat readability columns added
    for every object-dtype (text) column.

    New columns are named '<metric><i>' where i is the position of the
    text column, e.g. 'flesch_reading_ease0'.

    :param df2: input DataFrame (not modified)
    :return: copy with readability feature columns appended
    """
    df = df2.copy()
    # Data-driven table replaces ten near-identical .apply blocks.
    metrics = [
        ('flesch_reading_ease', textstat.flesch_reading_ease),
        ('smog_index', textstat.smog_index),
        ('flesch_kincaid_grade', textstat.flesch_kincaid_grade),
        ('coleman_liau_index', textstat.coleman_liau_index),
        ('automated_readability_index', textstat.automated_readability_index),
        ('dale_chall_readability_score', textstat.dale_chall_readability_score),
        ('difficult_words', textstat.difficult_words),
        ('linsear_write_formula', textstat.linsear_write_formula),
        ('gunning_fog', textstat.gunning_fog),
        ('text_standard', textstat.text_standard),
    ]
    text_feats = df.select_dtypes(include=['object']).columns.values
    for i, col in enumerate(text_feats):
        for name, metric in metrics:
            df['{}{}'.format(name, i)] = df[col].apply(metric)
    return df
def decide_if_assigned_threshold(self, person, threshold):
    """Assign the reading-level badge when the person's average
    Flesch-Kincaid grade (over titles + abstracts) is at most *threshold*.

    :param person: object exposing all_products
    :param threshold: maximum average grade level for the badge
    Side effects: sets self.candidate_badge.value and self.assigned.
    """
    reading_levels = {}
    for my_product in person.all_products:
        text = ""
        if my_product.title:
            text += u" " + my_product.title
        if my_product.get_abstract_using_mendeley():
            text += u" " + my_product.get_abstract_using_mendeley()
        # only do if at least three words between periods,
        # otherwise textstat library prints too many Not Enough Words error messages
        if text:
            sentences = text.split(".")
            if any([len(sentence.split()) > 3 for sentence in sentences]):
                try:
                    grade_level = textstat.flesch_kincaid_grade(text)
                    # is sometimes negative, strangely; skip those
                    if grade_level > 0:
                        reading_levels[my_product.doi] = grade_level
                except TypeError:
                    # textstat throws TypeError when text is too short
                    pass
    if reading_levels.values():
        average_reading_level = sum(reading_levels.values()) / float(
            len(reading_levels))
        # BUG FIX: the original ignored the `threshold` parameter and
        # hard-coded 14; the parameter exists for exactly this comparison.
        if average_reading_level <= threshold:
            self.candidate_badge.value = average_reading_level
            self.assigned = True
def text_analysis(x):
    """Annotate each result dict in *x* with polarity ('pola'),
    subjectivity ('subj'), and readability ('reada') of its 'summary'.

    :param x: iterable of dicts with a 'summary' key
    :return: the same iterable, dicts mutated in place
    """
    for result in x:
        blob = TextBlob(result['summary'])
        for text in blob.sentences:
            # NOTE(review): every iteration overwrites the previous values,
            # so only the LAST sentence's polarity/subjectivity survive, and
            # the whole-summary readability is recomputed once per sentence
            # — confirm this is intended.
            result['pola'] = '%.2f' % (abs(text.sentiment.polarity*10)/2)  #-1 to 1
            result['subj'] = '%.2f' % (abs(text.sentiment.subjectivity*10)/2)  #-1 to 1
            result['reada'] = '%.2f' % textstat.flesch_kincaid_grade(result['summary'])  #out of 10
    return x
def textConfidence(fname):
    """OCR the image file *fname* with Tesseract and print its Flesch
    readability scores plus the reading-ease interpretation bands.

    Python 2 snippet (print statements).
    """
    with PyTessBaseAPI() as api:
        #for image in images:
        api.SetImageFile(fname)
        text = api.GetUTF8Text()
        #print api.AllWordConfidences()
        print textstat.flesch_kincaid_grade(text)
        print textstat.flesch_reading_ease(text)
        # Flesch reading-ease interpretation bands for reference
        print("90-100 : Very Easy")
        print("80-89 : Easy")
        print("70-79 : Fairly Easy")
        print("60-69 : Standard")
        print("50-59 : Fairly Difficult")
        print("30-49 : Difficult")
        print("0-29 : Very Confusing")
def get_statistics(self, f, content):
    """Build a flat statistics row for one document.

    :param f: document identifier (first element of the returned row)
    :param content: document text (lower-cased before analysis)
    :return: [f, FK grade, word count] + keyword frequencies + sentiment
    """
    content = content.lower()
    reading_level = textstat.flesch_kincaid_grade(content)
    word_count = textstat.lexicon_count(content)
    # FIX: materialize as a list — on Python 3 `map` returns an iterator,
    # and `list + iterator` raises TypeError in the concatenation below.
    keyword_frequency = [pair[1]
                         for pair in self.get_keyword_frequency(content)]
    sentiment = DocumentStatistics.get_sentiment(content)
    return [f, reading_level, word_count] + keyword_frequency + sentiment
def vecify(v):
    """Readability feature vector for text *v*.

    Order: reading ease, FK grade, Coleman-Liau, ARI, Dale-Chall,
    difficult words, Linsear Write, Gunning fog (smog intentionally
    omitted, as in the original).
    """
    metrics = (
        ts.flesch_reading_ease,
        # ts.smog_index,
        ts.flesch_kincaid_grade,
        ts.coleman_liau_index,
        ts.automated_readability_index,
        ts.dale_chall_readability_score,
        ts.difficult_words,
        ts.linsear_write_formula,
        ts.gunning_fog,
    )
    return [metric(v) for metric in metrics]
def add_reading_levels(df):
    """Add flesch_kincaid, fk_score and gunning_fog columns computed from
    the 'body' column of *df*, mutating and returning it.

    NOTE(review): rows are addressed by the enumerate position, which only
    lines up with df['body'][row] / df.loc[row, ...] when the frame has a
    default 0..n-1 integer index — confirm.
    """
    for row, body in enumerate(df['body']):
        x = df['body'][row]
        df.loc[row, 'flesch_kincaid'] = textstat.flesch_kincaid_grade(x)
        # NOTE(review): 'fk_score' actually stores flesch_reading_ease,
        # not a Flesch-Kincaid value — confirm the column name is intended.
        df.loc[row, 'fk_score'] = textstat.flesch_reading_ease(x)
        #df.loc[row,'smog_index']=textstat.smog_index(x)
        df.loc[row, 'gunning_fog'] = textstat.gunning_fog(x)
        #df.loc[row,'difficult_words']=textstat.difficult_words(x)
        #df.loc[row,'text_standard']=textstat.text_standard(x)
    return df
def complex_str_pipeline(s):
    """Entropy and readability feature vector for string *s*.

    Order: Shannon entropy, collision entropy, gap to ideal encoding,
    Flesch reading ease, Flesch-Kincaid grade, LIX score.
    """
    shannon = renyi_entropy(s, alpha=1)      # Renyi alpha=1 -> Shannon
    collision = renyi_entropy(s, alpha=2)    # Renyi alpha=2 -> collision
    delta = shannon_ideal(s, logbase=2) - shannon  # distance to ideal encoding
    ease = ts.flesch_reading_ease(s)
    grade = ts.flesch_kincaid_grade(s)
    lix_score = lix(s)
    return [shannon, collision, delta, ease, grade, lix_score]
def all_trad_scores(text):
    """Return the eight traditional readability scores of *text* in fixed
    order: reading ease, FK grade, SMOG, Coleman-Liau, ARI, Dale-Chall,
    Linsear Write, Gunning fog."""
    scorers = (
        textstat.flesch_reading_ease,
        textstat.flesch_kincaid_grade,
        textstat.smog_index,
        textstat.coleman_liau_index,
        textstat.automated_readability_index,
        textstat.dale_chall_readability_score,
        textstat.linsear_write_formula,
        textstat.gunning_fog,
    )
    return [scorer(text) for scorer in scorers]
def reading_level(lyrics):
    """Bucket *lyrics* into a school level from its Flesch reading ease.

    BUG FIX: the original fed flesch_kincaid_grade (a ~0-18 grade scale)
    into thresholds (90/65/50/30) that belong to the 0-100 Flesch
    reading-ease scale, so almost every input fell through to
    'College Graduate'. flesch_reading_ease matches both the thresholds
    and the label direction (higher score = easier text = lower level).
    """
    score = textstat.flesch_reading_ease(lyrics)
    if score >= 90:
        return '5th Grade'
    elif score >= 65:
        return 'Middle School'
    elif score >= 50:
        return 'High School'
    elif score >= 30:
        return 'College'
    else:
        return 'College Graduate'
def score_text(self, txt):
    """Compute inverted reading-ease and FK grade for *txt*, refresh the
    UI labels, and record scores the first time each filename is seen.

    :param txt: text to score
    Side effects: updates self.file/self.ease/self.grade labels and the
    parallel lists self.files / self.easeScores / self.gradeScores.
    """
    # inverted scale: higher means harder (100 - reading ease)
    ease = float('%.2f' % (100 - textstat.flesch_reading_ease(txt)))
    grade_raw = float('%.2f' % textstat.flesch_kincaid_grade(txt))
    # BUG FIX: the original tested `grade <= 12` before `grade` was ever
    # bound, raising UnboundLocalError on every call; the intended test is
    # against the raw grade. Grades above 12 display as '12+ (raw)'.
    grade = grade_raw if grade_raw <= 12 else '12+ (%s)' % grade_raw
    self.file.config(text='Filename: %s' % self.filename)
    self.ease.config(text='Flesch Reading Ease scale score: %s' % ease)
    self.grade.config(text='Flesch-Kincaid Grade scale score: %s' % grade)
    if self.filename not in self.files:
        # parallel lists: filename (sans extension), ease, raw grade
        self.files.append(self.filename[:self.filename.find('.')])
        self.easeScores.append(ease)
        self.gradeScores.append(grade_raw)
def predict_trust(self, profile, strip_html=True):
    """Predicts the trustworthiness of a profile.

    Segments the input with sentence-level granularity, returning the
    probability that the profile represented by the input is perceived to
    be more trustworthy compared to other profiles of similar length.

    Args:
      profile: An Airbnb host profile, as a string.
      strip_html: Whether HTML tags in the input should be stripped. True
        by default, but can be disabled for speed if the input is known to
        be sanitized.

    Returns:
      An AirProfile.Prediction object for trustworthiness of the profile.

    Raises:
      ValueError: If the input is an invalid or empty string.
      IOError: If the LIWC trie is not available.
    """
    if not (profile and profile.strip()):
        raise ValueError
    if not (self.__liwc_path.exists() and self.__liwc_path.is_file()):
        raise IOError

    sentence_tokens = self.__preprocess(profile, strip_html)
    liwc_features = self.__liwc.summarize(profile, sentence_tokens)
    word_count = liwc_features['WC']
    liwc_features['wc_log'] = np.log(word_count)
    liwc_features['readability'] = ts.flesch_kincaid_grade(
        profile.decode('utf-8'))

    # BUG FIX: np.empty returns UNINITIALIZED memory, so the original
    # accumulated per-sentence predictions on top of garbage values;
    # the accumulator must start at zero.
    prediction_agg = np.zeros(len(self.__classifier_cat))
    for sent in sentence_tokens:
        prediction_agg += np.array(
            [c.predict for c in self.__classify_sentence(sent)])

    for idx, cat in enumerate(FEAT_WC_CATEGORIES):
        liwc_features[cat] = prediction_agg[idx]

    feats = [
        liwc_features[f]
        for f in AirProfile.__get_trust_model_feat_cols(word_count)
    ]
    feats_shape = np.array(feats).reshape(1, -1)
    model = self.__get_classifier_trust(
        AirProfile.__get_trust_model_fname(word_count))
    return self.Prediction(
        np.round(model.predict_proba(feats_shape)[0][1], 2),
        model.predict(feats_shape)[0])
def reading_difficulty(self):
    """Return (avg_grade, diff_words) for self.text, combining seven
    readability formulas.

    :return: tuple of the aggregated grade and the difficult-word ratio
    """
    # fraction of difficult words relative to self.nword (total words)
    diff_words = textstat.difficult_words(self.text) / self.nword
    flesch_kincaid = textstat.flesch_kincaid_grade(self.text)
    coleman_liau = textstat.coleman_liau_index(self.text)
    ari = textstat.automated_readability_index(self.text)
    dale_chall = textstat.dale_chall_readability_score(self.text)
    linsear = textstat.linsear_write_formula(self.text)
    # NOTE(review): Gunning fog is offset by -6, presumably to align it
    # with the other grade-level scales — confirm.
    gunning_fog = textstat.gunning_fog(self.text) - 6
    smog = textstat.smog_index(self.text)
    # NOTE(review): max(ceil(mean), 12) FLOORS the result at grade 12;
    # if the intent was to CAP at 12, this should be min — confirm.
    avg_grade = max(
        math.ceil((flesch_kincaid + coleman_liau + ari + dale_chall +
                   linsear + gunning_fog + smog) / 7), 12)
    return avg_grade, diff_words
def textstat_analysis(profile_text):
    """Run the full battery of textstat metrics on *profile_text*.

    :return: 11-tuple (reading ease, SMOG, FK grade, Coleman-Liau, ARI,
             Dale-Chall, difficult words, Linsear Write, Gunning fog,
             readability consensus, lexicon count)
    """
    return (
        textstat.flesch_reading_ease(profile_text),
        textstat.smog_index(profile_text),
        textstat.flesch_kincaid_grade(profile_text),
        textstat.coleman_liau_index(profile_text),
        textstat.automated_readability_index(profile_text),
        textstat.dale_chall_readability_score(profile_text),
        textstat.difficult_words(profile_text),
        textstat.linsear_write_formula(profile_text),
        textstat.gunning_fog(profile_text),
        textstat.readability_consensus(profile_text),
        textstat.lexicon_count(profile_text),
    )
def c():
    """Scan DynamoDB items that carry a FIPS code and return JSON mapping
    each item's FIPS id to the Flesch-Kincaid grade of its joined tweets."""
    resp = table.scan(FilterExpression=Attr("fips").exists())
    sorted_by_fips = {'items': []}
    for item in resp['Items']:
        joined_tweets = " ".join(item['tweet'])
        # NOTE: the original named this 'polarity', but the value is a
        # readability grade, not a sentiment polarity.
        grade = textstat.flesch_kincaid_grade(joined_tweets)
        sorted_by_fips['items'].append({
            'id': str(item['fips']),
            'rate': grade
        })
    return json.dumps(sorted_by_fips)
def get_readability(contents):
    """Return ten textstat metrics for *contents* as a list, in fixed
    order (the last entry, text_standard, is a grade-label string)."""
    metrics = (
        textstat.flesch_reading_ease,
        textstat.smog_index,
        textstat.flesch_kincaid_grade,
        textstat.automated_readability_index,
        textstat.dale_chall_readability_score,
        textstat.difficult_words,
        textstat.linsear_write_formula,
        textstat.gunning_fog,
        textstat.coleman_liau_index,
        textstat.text_standard,
    )
    return [metric(contents) for metric in metrics]
def __load_text(self):
    """Read self.filename from local_data_dir and populate readability
    scores, sentence/word statistics, and token lists on the instance.

    Python 2 snippet (print statement). Sets: flesch_reading_ease,
    flesch_kincaid_grade, n_sentences, avg_sentence_length,
    avg_word_length, sentences, tokens.
    """
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    with codecs.open('{}/{}'.format(local_data_dir, self.filename), 'r',
                     encoding='utf8', errors='ignore') as f:
        data = f.read()
    self.flesch_reading_ease = textstat.flesch_reading_ease(data)
    self.flesch_kincaid_grade = textstat.flesch_kincaid_grade(data)
    sentences = tokenizer.tokenize(data)
    self.n_sentences = textstat.sentence_count(data)
    # words per sentence (lexicon_count's second arg removes punctuation)
    self.avg_sentence_length = textstat.lexicon_count(data, True) * 1. / self.n_sentences
    # mean character length of non-stopword, space-split tokens
    self.avg_word_length = np.mean([len(w) for s in sentences for w in s.split(' ') if w not in stopwords.words('english')])
    print 'Parse ', len(sentences), ' sentences, average sentence length ', self.avg_sentence_length, ', average word length ', self.avg_word_length
    self.sentences = sentences
    self.tokens = []
    # flatten per-sentence token lists into self.tokens
    [self.tokens.extend(text_tokenize(sentence)) for sentence in sentences]
def age_feature(text, feature_vect):
    """
    Extract age features
    :param text: input text
    :param feature_vect: contains a bag of words
    :return: a dictionary of bag-of-words presence features plus the
             FRE and FKGL readability scores
    """
    token_set = set(word_tokenize(text.lower()))
    features = {'contains(%s)' % word: word in token_set
                for word in feature_vect}
    features['FRE'] = textstat.flesch_reading_ease(text)
    features['FKGL'] = textstat.flesch_kincaid_grade(text)
    return features
def flesch_from_list(frequent_word_list):
    '''
    If we want advanced text, this function allows us to run Flesch
    Kincaid on a list of frequent words that we turn back into a string
    to process.

    Input:
        frequent_word_list: a list of frequent words
    Returns:
        The Flesch Kincaid grade of the most frequent words.
    '''
    return textstat.flesch_kincaid_grade(" ".join(frequent_word_list))
def analyze_one(self, email):
    """Analyze a single email and append its readability scores to the
    instance's running lists."""
    # sentence count is floored at 1 before being recorded
    sentence_total = tstat.sentence_count(email)
    self.sent_count.append(max(sentence_total, 1))
    if email and len(email) > 0:
        scores = (
            (self.flesch_kincaid_grade, tstat.flesch_kincaid_grade),
            (self.automated_readability_index,
             tstat.automated_readability_index),
            (self.coleman_liau_index, tstat.coleman_liau_index),
            (self.linsear_write_formula, tstat.linsear_write_formula),
            (self.dale_chall_readability_score,
             tstat.dale_chall_readability_score),
        )
        for bucket, metric in scores:
            bucket.append(metric(email))
def main():
    """For each file named on the command line, write a sibling
    `<name>.readability.snip` file listing every textstat metric."""
    # (label, metric) pairs -- written in this exact order.
    metrics = (
        ('syllable_count', textstat.syllable_count),
        ('lexicon_count', textstat.lexicon_count),
        ('sentence_count', textstat.sentence_count),
        ('difficult_words', textstat.difficult_words),
        ('flesch_reading_ease', textstat.flesch_reading_ease),
        ('flesch_kincaid_grade', textstat.flesch_kincaid_grade),
        ('smog_index', textstat.smog_index),
        ('automated_readability_index', textstat.automated_readability_index),
        ('coleman_liau_index', textstat.coleman_liau_index),
        ('linsear_write_formula', textstat.linsear_write_formula),
        ('dale_chall_readability_score', textstat.dale_chall_readability_score),
    )
    for path in sys.argv[1:]:
        with open(path) as src:
            text = src.read()
        with open(path + '.readability.snip', 'w') as out:
            for label, metric in metrics:
                out.write("%s : %s\n" % (label, metric(text)))
def calculate2FormulaFromFile(inputFile, isTEI=1):
    """
    Compute two readability formulas for one (TEI) input file.

    :param inputFile: path of the file to score
    :param isTEI: flag forwarded to extractText.extractTextTEI (1 = TEI markup)
    :return: tuple (inputFile, flesch_kincaid_grade, dale_chall_score);
             a score is -1 if its computation failed.
    """
    inputData = extractText.extractTextTEI(inputFile, isTEI)
    # Underscores come from the extraction step; treat them as spaces.
    inputData = re.sub('_', ' ', inputData)
    try:
        r1 = textstat.flesch_kincaid_grade(inputData)
    except Exception:  # narrowed from bare except; -1 is the failure sentinel
        print('ERROR: cannot calculate flesch_kincaid_grade for ', inputFile)
        r1 = -1
    try:
        r2 = textstat.dale_chall_readability_score(inputData)
    except Exception:  # narrowed from bare except; -1 is the failure sentinel
        print('ERROR: cannot calculate dale_chall_readability_score for ', inputFile)
        r2 = -1
    print('processing file', inputFile, 'complete')
    return (inputFile, r1, r2)
def analyze1(text):
    """
    Return the Flesch-Kincaid grade of `text`, or the sentinel -1.0 for
    invalid input (blank / URL) or an invalid result.

    :param text: candidate text to score
    :return: non-negative float grade, or -1.0
    """
    # Automatically reject if no input.  `"".isspace()` is False, so the
    # empty string must be tested explicitly (the original fell through
    # into textstat for "").
    if not text or text.isspace():
        return -1.0
    if text.startswith('http'):
        return -1.0
    # Analyze text
    try:
        x = textstat.flesch_kincaid_grade(text)
    except Exception:  # narrowed from bare except; any scoring failure -> sentinel
        return -1.0
    # Keep outputs valid: must be a non-negative float.
    if not isinstance(x, float):
        return -1.0
    if x < 0:
        return -1.0
    return x
def calculate_readability_measures(id):
    """ Count the words in doc and update the document.

    Fetches page `id` from the local Elasticsearch index 'beek', computes a
    battery of textstat readability scores over its 'content' field, and
    writes them back under the document's 'measures' key.
    """
    # NOTE: parameter `id` shadows the builtin; kept for interface compatibility.
    es = elasticsearch.Elasticsearch()
    source = es.get_source(index='beek', doc_type='page', id=id)

    # count = len(source['content'].split())

    try:
        measures = {
            'flesch': textstat.flesch_reading_ease(source['content']),
            'smog': textstat.smog_index(source['content']),
            'flesch_kincaid': textstat.flesch_kincaid_grade(source['content']),
            'coleman_liau': textstat.coleman_liau_index(source['content']),
            'readability': textstat.automated_readability_index(source['content']),
            'dale_chall': textstat.dale_chall_readability_score(source['content']),
            'difficult_words': textstat.difficult_words(source['content']),
            'linsear_write_formula': textstat.linsear_write_formula(source['content']),
            'gunning_fog': textstat.gunning_fog(source['content']),
            # NOTE(review): readability_consensus is the legacy textstat API
            # name (renamed text_standard in later versions) -- confirm the
            # pinned textstat version.
            'consensus': textstat.readability_consensus(source['content']),
        }
        es.update(index='beek', doc_type='page', id=id,
                  body={'doc': {'measures': measures}}, refresh=True)
    except Exception as err:
        # Best-effort: any scoring or indexing failure leaves the document
        # unchanged.  NOTE(review): `err` is swallowed silently -- consider
        # at least logging it.
        pass
def fic2text(ident):
    """
    Return (tags, rtext) for fic `ident`: the flattened tag names from the
    base record and the cleaned, ASCII-only concatenation of its text segments.

    Bug fix: the original assigned both re.sub results to a throwaway
    variable `s` (computed from the pre-substitution `line`) and then
    discarded them, so neither substitution ever took effect.  They are now
    applied to `line` in sequence, as the surrounding code clearly intends.
    """
    textsegs = Loader.get_field(data['fics'], ident, 'fic')
    rtags = Loader.get_field(data['base'], ident, 'tags')
    cleaned = []
    for line in textsegs:
        line = line.replace(u'\xa0', ' ')              # non-breaking space -> space
        line = re.sub(r'([.,!?()])', r' \1 ', line)    # pad punctuation with spaces
        line = re.sub(r'\s{2,}', ' ', line)            # collapse whitespace runs
        line = line.encode('ascii', 'ignore').decode('ascii')  # strip non-ASCII
        cleaned.append(line)
    # Same accumulation as the original `rtext += line + " "` loop.
    rtext = "".join(seg + " " for seg in cleaned)
    # Flatten tag names across all genres.
    tags = []
    for genre in rtags:
        for el in rtags[genre]:
            tags.append(el["name"])
    reading_ease = textstat.flesch_reading_ease(rtext)
    reading_level = textstat.flesch_kincaid_grade(rtext)
    print(ident, reading_ease, reading_level)
    return tags, rtext
import json
from textstat.textstat import textstat

# Python 2 exploratory script over a dumped user timeline.
filename = 'usertimeline.json'
READ = 'rb'
TEXT = 1
tweets = json.load(open(filename, READ))

#Identify retweets
# NOTE(review): tweet['text'][TEXT] selects the single character at index 1
# of the text, so the inner loop iterates one-char strings and 'RT' in word
# can never be True -- confirm the intended structure of tweet['text'].
retweets = [word for tweet in tweets for word in tweet['text'][TEXT] if 'RT' in word]
print retweets

#identify replies
#Word count (precomputed per tweet)
print [tweet['analysis']['word-count'] for tweet in tweets]
#How would you do a character count?

#Lexical diversity: total words / unique words
lex_div = lambda text: len(text.split())/float(len(set(text.split())))
print [lex_div(tweet['text'][TEXT]) for tweet in tweets]

#F-K: Flesch-Kincaid grade per tweet
print [textstat.flesch_kincaid_grade(tweet['text'][TEXT]) for tweet in tweets]

#remove stopwords
# NOTE(review): `stopwords` is not imported in this excerpt -- presumably
# nltk.corpus.stopwords; verify.
print [[word for word in tweet['text'][TEXT].split() if word not in stopwords.words('english') ] for tweet in tweets]
#What's another way to filter out stopwords?
#How to handle punctuation?
def engineer_NLP_features(doc):
    """
    Generate NLP features (language and sentiment) for a Mashable article,
    to be used in predicting its number of shares.

    Arguments:
        doc: MongoDB document containing the article content data
    Output:
        Writes the computed NLP features back to MongoDB for the document.
    """
    # get article headline and article content from Mongo DB document
    headline = doc['title']
    content = doc['content'].encode('utf-8')  # Python 2 byte string

    # ---- headline features ----
    # number of words in title
    n_tokens_title = len(headline.split())
    # subjectivity
    title_subjectivity = TextBlob(headline).subjectivity
    # polarity
    title_sentiment_polarity = TextBlob(headline).polarity
    # absolute value polarity
    title_sentiment_abs_polarity = abs(title_sentiment_polarity)
    # average word length (punctuation stripped first)
    average_token_length_title = np.mean([len(w) for w in "".join(c for c in headline if c not in string.punctuation).split()])

    # ---- content features ----
    # number of words
    n_tokens_content = len([w for w in content.split()])
    # rate of unique words
    # NOTE(review): these ratios rely on true division -- presumably
    # `from __future__ import division` at the top of the file; verify.
    r_unique_tokens = len(set([w.lower().decode('utf-8') for w in "".join(c for c in content if c not in string.punctuation).split()]))/n_tokens_content
    # rate of non-stop words
    r_non_stop_words = len([w.lower().decode('utf-8') for w in "".join(c for c in content if c not in string.punctuation).split() if w.decode('utf-8') not in stop])/n_tokens_content
    # rate of unique non-stop words
    r_non_stop_unique_tokens = len(set([w.lower().decode('utf-8') for w in "".join(c for c in content if c not in string.punctuation).split() if w.decode('utf-8') not in stop]))/n_tokens_content
    # average word length
    average_token_length_content = np.mean([len(w) for w in "".join(c for c in content if c not in string.punctuation).split()])
    # subjectivity
    global_subjectivity = TextBlob(content.decode('utf-8')).subjectivity
    # polarity
    global_sentiment_polarity = TextBlob(content.decode('utf-8')).polarity
    # absolute polarity
    global_sentiment_abs_polarity = abs(global_sentiment_polarity)
    # per-word polarity: list of (word, polarity) pairs
    polarity_list = [(w.decode('utf-8'), TextBlob(w.decode('utf-8')).polarity) for w in "".join(c for c in content if c not in string.punctuation).split()]
    # global positive word rate
    global_rate_positive_words = len([(w,p) for (w,p) in polarity_list if p > 0])/len(polarity_list)
    # global negative word rate
    global_rate_negative_words = len([(w,p) for (w,p) in polarity_list if p < 0])/len(polarity_list)
    # positive word rate (among non-neutral words)
    if [(w,p) for (w,p) in polarity_list if p != 0]:
        rate_positive_words = len([(w,p) for (w,p) in polarity_list if p > 0])/len([(w,p) for (w,p) in polarity_list if p != 0])
    else:
        rate_positive_words = 0
    # negative word rate (among non-neutral words)
    if [(w,p) for (w,p) in polarity_list if p != 0]:
        rate_negative_words = len([(w,p) for (w,p) in polarity_list if p < 0])/len([(w,p) for (w,p) in polarity_list if p != 0])
    else:
        rate_negative_words = 0
    # average polarity of positive words (0 when there are none)
    if [p for (w,p) in polarity_list if p > 0]:
        avg_positive_polarity = np.mean([p for (w,p) in polarity_list if p > 0])
    else:
        avg_positive_polarity = 0
    # minimum polarity of positive words
    if [p for (w,p) in polarity_list if p > 0]:
        min_positive_polarity = min([p for (w,p) in polarity_list if p > 0])
    else:
        min_positive_polarity = 0
    # maximum polarity of positive words
    if [p for (w,p) in polarity_list if p > 0]:
        max_positive_polarity = max([p for (w,p) in polarity_list if p > 0])
    else:
        max_positive_polarity = 0
    # average polarity of negative words
    if [p for (w,p) in polarity_list if p < 0]:
        avg_negative_polarity = np.mean([p for (w,p) in polarity_list if p < 0])
    else:
        avg_negative_polarity = 0
    # minimum polarity of negative words
    if [p for (w,p) in polarity_list if p < 0]:
        min_negative_polarity = min([p for (w,p) in polarity_list if p < 0])
    else:
        min_negative_polarity = 0
    # maximum polarity of negative words
    if [p for (w,p) in polarity_list if p < 0]:
        max_negative_polarity = max([p for (w,p) in polarity_list if p < 0])
    else:
        max_negative_polarity = 0
    # abs maximum polarity: sum of abs of max positive and abs of min negative polarity
    max_abs_polarity = max_positive_polarity + abs(min_negative_polarity)
    # Flesch Reading Ease
    global_reading_ease = textstat.flesch_reading_ease(content.decode('utf-8'))
    # Flesch Kincaid Grade Level
    global_grade_level = textstat.flesch_kincaid_grade(content.decode('utf-8'))
    # Persist every computed feature on the article document.
    collection.update_one({"_id": doc["_id"]}, {"$set": {"n_tokens_title": n_tokens_title, "title_subjectivity": title_subjectivity, "title_sentiment_polarity": title_sentiment_polarity, "title_sentiment_abs_polarity": title_sentiment_abs_polarity, "average_token_length_title": average_token_length_title, "n_tokens_content": n_tokens_content, "r_unique_tokens": r_unique_tokens, "r_non_stop_words": r_non_stop_words, "r_non_stop_unique_tokens": r_non_stop_unique_tokens, "average_token_length_content": average_token_length_content, "global_subjectivity": global_subjectivity, "global_sentiment_polarity": global_sentiment_polarity, "global_sentiment_abs_polarity": global_sentiment_abs_polarity, "global_rate_positive_words": global_rate_positive_words, "global_rate_negative_words": global_rate_negative_words, "rate_positive_words": rate_positive_words, "rate_negative_words": rate_negative_words, "avg_positive_polarity": avg_positive_polarity, "min_positive_polarity": min_positive_polarity, "max_positive_polarity": max_positive_polarity, "avg_negative_polarity": avg_negative_polarity, "min_negative_polarity": min_negative_polarity, "max_negative_polarity": max_negative_polarity, "max_abs_polarity": max_abs_polarity, "global_reading_ease": global_reading_ease, "global_grade_level": global_grade_level}})
# Python 2 fragment: writename, lyrics, lyrics_repl, title, artist, year,
# pos, dataset, sent_analyzer and ts are all defined above this excerpt.
# Write the lyrics out to their own file.
target = open(writename, 'w')
target.truncate()
target.write(lyrics)
target.close()
# Build Dataset
try:
    cur = {
        "title": title,
        "artist": artist,
        "year": year,
        "pos": pos,
        "lyrics": lyrics,
        "tags": get_tags(artist),
        # sentiment + textstat readability metrics over the cleaned lyrics
        "sentiment": sent_analyzer.polarity_scores(lyrics_repl),
        "f_k_grade": ts.flesch_kincaid_grade(lyrics_repl),
        "flesch_index": ts.flesch_reading_ease(lyrics_repl),
        "fog_index": ts.gunning_fog(lyrics_repl),
        "difficult_words": ts.difficult_words(lyrics_repl),
        "num_syllables": ts.syllable_count(lyrics_repl),
        "num_words": ts.lexicon_count(lyrics_repl, True),
        "num_lines": ts.sentence_count(lyrics_repl),
        "num_dupes": count_dupes(lyrics)
    }
    # print cur
    dataset.append(cur)
except Exception, e:
    print e
# NOTE(review): this second handler pairs with an enclosing `try:` that
# begins above this excerpt.
except Exception, e:
    print "Exception occurred for " + artist + ' - ' + title
tweets = json.load(open(filename,READ)) #Identify retweets retweets = [word for tweet in tweets for word in tweet['text'][TEXT] if 'RT' in word] print retweets #identify replies #Word count print [tweet['analysis']['word-count'] for tweet in tweets] #How would you do a character count? #Lexical diversity lex_div = lambda text: len(text.split())/float(len(set(text.split()))) print [lex_div(tweet['text'][TEXT]) for tweet in tweets] #F-K FK = [] for tweet in tweets: print tweet['text'] try: FK.append(textstat.flesch_kincaid_grade(tweet['text'])) except: FK.append(None) print 'FK:', FK #remove stopwords print 'Removed stopwords:', [[word for word in tweet['text'].split() if word not in stopwords] for tweet in tweets] #What's another way to filter out stopwords? #How to handle punctuation?
def extract_features_sub(text, dialogue = True):
    """
    Extract language-complexity, lexical-diversity, sentiment and General
    Inquirer features from one aggregated script section, renaming every
    feature with a section suffix.

    :param text: aggregated dialogue or action text for a movie script
    :param dialogue: True -> features suffixed "_dialogue",
                     False -> suffixed "_action"
    :return: dict of renamed features (via convert())
    """
    ## aggregate all dialogue, action
    if len(text) > 0:
        try:
            language_complexity = {'flesch_reading_ease': textstat.flesch_reading_ease(text),
                                   'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
                                   'automated_readability_index': textstat.automated_readability_index(text)}
        except Exception:  # narrowed from bare except; keep best-effort None fallback
            language_complexity = {'flesch_reading_ease': None,
                                   'flesch_kincaid_grade': None,
                                   'automated_readability_index': None}
    else:
        # Empty section: readability is defined as 0, not None.
        language_complexity = {'flesch_reading_ease': 0,
                               'flesch_kincaid_grade': 0,
                               'automated_readability_index': 0}
    lexical_diversity = find_lex_d(text)
    sentiment = extract_senti_wordnet(text)
    inquirer_features = general_inquirer_features(text)
    # Merge all feature families into one dict.
    final_features = {}
    final_features.update(language_complexity)
    final_features.update(lexical_diversity)
    final_features.update(sentiment)
    final_features.update(inquirer_features)
    # Rename every key with the section suffix in a single pass.
    curr_keys = [feature for feature in final_features]
    suffix = "_dialogue" if dialogue else "_action"
    new_keys = [feature + suffix for feature in curr_keys]
    return convert(final_features, dict(zip(curr_keys, new_keys)))
# Python 2 script fragment: build four readability features per review and
# grid-search an SVR over them.  `reviews`, `compute_score`, `ts`, `np`,
# `StandardScaler`, `SVR` and `GridSearchCV` are defined above this excerpt.
print len(reviews)
reviews['scores'] = reviews['helpful'].apply(compute_score)
print reviews['scores'].head(n=10)
y = reviews['scores']
Text = reviews['reviewText']
# Free the DataFrame early; only the two extracted columns are kept.
del reviews
# One row per review: [flesch_ease, fk_grade, gunning_fog, smog].
X = np.zeros((len(Text), 4))
for idx, review in enumerate(Text):
    if review == '':
        continue  # empty reviews keep their all-zero row
    try:
        X[idx][0] = ts.flesch_reading_ease(review)
        X[idx][1] = ts.flesch_kincaid_grade(review)
        X[idx][2] = ts.gunning_fog(review)
        X[idx][3] = ts.smog_index(review)
    except Exception as e:
        # Scoring failures also leave the zero row; print for diagnosis.
        print review
        print e
X = StandardScaler().fit_transform(X)
print 'Computed X'
print X[0]
model = SVR(verbose=True)
params = {'C': [0.1, 0.5]}
grid = GridSearchCV(model, params, cv=10, scoring='mean_squared_error', n_jobs=-1)
grid.fit(X, y)
print grid.best_score_
# Coleman-Liau index: goo.gl/8sE0m1 cl_index_grades = [] cl_index_total_grade = 0 # Linsear Write Formula: goo.gl/GuOZ8B lwf_grades = [] lwf_total_grade = 0 # Dale-Chall Readability Score: goo.gl/dvmXmx dcr_grades = [] dcr_total_grade = 0 num_tweets = 0 for tweet in cleanest_tweets: # skipping tweets which are not just contextbased text. if textstat.sentence_count(tweet) < 1: continue flesch_kincaid_grade = textstat.flesch_kincaid_grade(tweet) flesch_kincaid_grades.append(flesch_kincaid_grade) flesch_kincaid_total_grade += flesch_kincaid_grade gunning_fog_grade = textstat.gunning_fog(tweet) gunning_fog_grades.append(gunning_fog_grade) gunning_fog_total_grade += gunning_fog_grade smog_index_grade = textstat.smog_index(tweet) smog_index_grades.append(smog_index_grade) smog_index_total_grade += smog_index_grade ar_index_grade = textstat.automated_readability_index(tweet) ar_index_grades.append(ar_index_grade) ar_index_total_grade += ar_index_grade
def reading_level(raw_text):
    # Print four readability statistics for raw_text (Python 2 prints).
    # NOTE(review): avg_sentence_length / avg_letter_per_word belong to the
    # legacy textstat API -- confirm against the installed textstat version.
    print "Flesch Reading Ease: ",textstat.flesch_reading_ease(raw_text)
    print "Flesch-Kincaid Grade Level: ", textstat.flesch_kincaid_grade(raw_text)
    print "Average Sentence Length: ", textstat.avg_sentence_length(raw_text)
    print "Average Word Length: ", textstat.avg_letter_per_word(raw_text)
# Python 2 script: per-student Flesch-Kincaid grade and lexical diversity
# over concatenated comments, written to a TSV and then histogrammed.
rcParams['text.usetex'] = True
data = [line for line in csv.DictReader(open('comments.csv','rb'))]
names = set([entry['Name'] for entry in data])
TAB = '\t'
WRITE = 'wb'
READ = 'rb'
filename = 'FK-calculated'
with open(filename,WRITE) as out:
    for name in names:
        #This measurement is confounded by lengths of the text
        text = ' '.join([entry['Student Comment'] for entry in data if entry['Name'] == name and entry['Student Comment'] != 'None'])
        try:
            grade_level = textstat.flesch_kincaid_grade(text)
        except:
            grade_level = -1  # sentinel: grading failed (e.g. empty text)
        try:
            lex_div = len(text.split())/float(len(set(text.split())))
        except:
            lex_div = -1  # sentinel: division by zero on empty text
        # Python 2 "print chevron" writes one TSV row per student.
        print>>out,'%s \t %.02f \t %.02f'%(name,grade_level,lex_div)
# Re-read the TSV and unzip into parallel columns.
names, grade_levels, lex_div= zip(*[line.split('\t') for line in open(filename,READ).read().splitlines()])
grade_levels = map(float,grade_levels)
lex_div = map(float,lex_div)
# Side-by-side histograms sharing the y axis (second panel filled later).
fig,(ax,ax2) = plt.subplots(nrows=1,ncols=2,sharey=True)
ax.hist(grade_levels, bins=10,color='k')
def create_NLP_features(data, headline, content):
    """
    Add NLP features to DF or Dictionary of data to be input to mashable
    models for prediction.  Mutates `data` in place.

    Arguments:
        data: DataFrame or Dictionary to populate
        headline: string containing article headline
        content: string containing article content (Python 2 byte string)
    """
    # ---- headline features ----
    # number of words in title
    data['n_tokens_title'] = len(headline.split())
    # subjectivity
    data['title_subjectivity'] = TextBlob(headline).subjectivity
    # polarity (rounded to 2 dp)
    data['title_sentiment_polarity'] = round(TextBlob(headline).polarity,2)
    # absolute value polarity
    data['title_sentiment_abs_polarity'] = abs(data['title_sentiment_polarity'])
    # average word length (punctuation stripped first)
    data['average_token_length_title'] = np.mean([len(w) for w in "".join(c for c in headline if c not in string.punctuation).split()])
    # ---- content features ----
    # number of words
    data['n_tokens_content'] = len([w for w in content.split()])
    # rate of unique words
    # NOTE(review): these ratios rely on true division -- presumably
    # `from __future__ import division` at the top of the file; verify.
    data['r_unique_tokens'] = round(len(set([w.lower().decode('utf-8') for w in "".join(c for c in content if c not in string.punctuation).split()]))/data['n_tokens_content'],2)
    # rate of non-stop words
    data['r_non_stop_words'] = len([w.lower().decode('utf-8') for w in "".join(c for c in content if c not in string.punctuation).split() if w.decode('utf-8') not in stop])/data['n_tokens_content']
    # rate of unique non-stop words
    data['r_non_stop_unique_tokens'] = len(set([w.lower().decode('utf-8') for w in "".join(c for c in content if c not in string.punctuation).split() if w.decode('utf-8') not in stop]))/data['n_tokens_content']
    # average word length
    data['average_token_length_content'] = np.mean([len(w) for w in "".join(c for c in content if c not in string.punctuation).split()])
    # subjectivity
    data['global_subjectivity'] = TextBlob(content.decode('utf-8')).subjectivity
    # polarity (rounded to 2 dp)
    data['global_sentiment_polarity'] = round(TextBlob(content.decode('utf-8')).polarity,2)
    # absolute polarity
    data['global_sentiment_abs_polarity'] = abs(data['global_sentiment_polarity'])
    # per-word polarity: list of (word, polarity) pairs
    polarity_list = [(w.decode('utf-8'), TextBlob(w.decode('utf-8')).polarity) for w in "".join(c for c in content if c not in string.punctuation).split()]
    # global positive word rate
    data['global_rate_positive_words'] = len([(w,p) for (w,p) in polarity_list if p > 0])/len(polarity_list)
    # global negative word rate
    data['global_rate_negative_words'] = len([(w,p) for (w,p) in polarity_list if p < 0])/len(polarity_list)
    # positive word rate (among non-neutral words)
    if [(w,p) for (w,p) in polarity_list if p != 0]:
        data['rate_positive_words'] = len([(w,p) for (w,p) in polarity_list if p > 0])/len([(w,p) for (w,p) in polarity_list if p != 0])
    else:
        data['rate_positive_words'] = 0
    # negative word rate (among non-neutral words)
    if [(w,p) for (w,p) in polarity_list if p != 0]:
        data['rate_negative_words'] = len([(w,p) for (w,p) in polarity_list if p < 0])/len([(w,p) for (w,p) in polarity_list if p != 0])
    else:
        data['rate_negative_words'] = 0
    # average polarity of positive words (0 when there are none)
    if [p for (w,p) in polarity_list if p > 0]:
        data['avg_positive_polarity'] = np.mean([p for (w,p) in polarity_list if p > 0])
    else:
        data['avg_positive_polarity'] = 0
    # minimum polarity of positive words
    if [p for (w,p) in polarity_list if p > 0]:
        data['min_positive_polarity'] = min([p for (w,p) in polarity_list if p > 0])
    else:
        data['min_positive_polarity'] = 0
    # maximum polarity of positive words
    if [p for (w,p) in polarity_list if p > 0]:
        data['max_positive_polarity'] = max([p for (w,p) in polarity_list if p > 0])
    else:
        data['max_positive_polarity'] = 0
    # average polarity of negative words
    if [p for (w,p) in polarity_list if p < 0]:
        data['avg_negative_polarity'] = np.mean([p for (w,p) in polarity_list if p < 0])
    else:
        data['avg_negative_polarity'] = 0
    # minimum polarity of negative words
    if [p for (w,p) in polarity_list if p < 0]:
        data['min_negative_polarity'] = min([p for (w,p) in polarity_list if p < 0])
    else:
        data['min_negative_polarity'] = 0
    # maximum polarity of negative words
    if [p for (w,p) in polarity_list if p < 0]:
        data['max_negative_polarity'] = max([p for (w,p) in polarity_list if p < 0])
    else:
        data['max_negative_polarity'] = 0
    # abs maximum polarity: sum of abs of max positive and abs of min negative polarity
    data['max_abs_polarity'] = data['max_positive_polarity'] + abs(data['min_negative_polarity'])
    # Flesch Reading Ease
    data['global_reading_ease'] = textstat.flesch_reading_ease(content.decode('utf-8'))
    # Flesch Kincaid Grade Level
    data['global_grade_level'] = textstat.flesch_kincaid_grade(content.decode('utf-8'))
#location of full .csv with open(inFile,"rb") as source: rdr= csv.reader( source, delimiter = ',' ) #change path below to update output file location with open(outfile,"wb") as result: wtr= csv.writer( result, delimiter=',') i = 0 for r in rdr: #get the description from each field description = r[2] #try/catch to calculate grade level try: gradeLevel= textstat.flesch_kincaid_grade(description) except: gradeLevel = 'Unable to calculate!' #append grade level calculation to list r.append(gradeLevel) wtr.writerow(r) i+=1 #print obs to track execution in console print(i)
#!/bin/python
import sys, string, os
from textstat.textstat import textstat

# Read the file named on the command line, stripping trailing whitespace
# from each line and concatenating everything into one string.
script_name = sys.argv[0]
inputfile = sys.argv[1]
with open(inputfile) as src:
    test_data = "".join(line.rstrip() for line in src)

# Every textstat metric, in output order; lambdas wrap the two calls that
# take an extra argument or belong to the legacy API signature.
_metrics = (
    textstat.flesch_reading_ease,
    textstat.smog_index,
    textstat.flesch_kincaid_grade,
    textstat.coleman_liau_index,
    textstat.automated_readability_index,
    textstat.dale_chall_readability_score,
    textstat.difficult_words,
    textstat.linsear_write_formula,
    textstat.gunning_fog,
    textstat.readability_consensus,
    textstat.syllable_count,
    lambda t: textstat.lexicon_count(t, 1),
    textstat.sentence_count,
)

# Emit one comma-separated line of all thirteen values.
print(','.join(str(metric(test_data)) for metric in _metrics))