def get_readability_scores(text):
    """Compute a battery of textstat readability metrics for *text*.

    Parameters
    ----------
    text : str
        The text to analyse.

    Returns
    -------
    dict
        Metric name -> score.  Unlike the other entries, ``difficult_words``
        is normalised by the whitespace-token word count (a ratio, not the
        raw textstat count).
    """
    # Guard against ZeroDivisionError for empty / whitespace-only input:
    # the original divided by len(text.split()) unconditionally.
    word_count = len(text.split())
    scores = {
        'flesch_reading_ease': textstat.flesch_reading_ease(text),
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
        'gunning_fog': textstat.gunning_fog(text),
        'smog_index': textstat.smog_index(text),
        'automated_readability_index': textstat.automated_readability_index(text),
        'coleman_liau_index': textstat.coleman_liau_index(text),
        'linsear_write_formula': textstat.linsear_write_formula(text),
        'dale_chall_readability_score': textstat.dale_chall_readability_score(text),
        # float_output=True yields a numeric grade rather than a label string.
        'text_standard': textstat.text_standard(text, float_output=True),
        'difficult_words': (textstat.difficult_words(text) / word_count
                            if word_count else 0.0),
    }
    return scores
def analyze_vocab(text):
    """Return the lexicon word count plus the standard textstat readability
    scores for *text*, keyed by metric name."""
    metrics = {}
    metrics['num_words'] = textstat.lexicon_count(text)
    metrics['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
    metrics['smog_index'] = textstat.smog_index(text)
    metrics['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)
    metrics['coleman_liau_index'] = textstat.coleman_liau_index(text)
    metrics['automated_readability_index'] = textstat.automated_readability_index(text)
    metrics['dale_chall_readability_score'] = textstat.dale_chall_readability_score(text)
    metrics['difficult_words'] = textstat.difficult_words(text)
    metrics['linsear_write_formula'] = textstat.linsear_write_formula(text)
    metrics['gunning_fog'] = textstat.gunning_fog(text)
    # float_output=True yields a numeric grade instead of a label string.
    metrics['text_standard'] = textstat.text_standard(text, float_output=True)
    return metrics
def score_text(self, test_data):
    """Return a dict of the standard textstat readability metrics for
    *test_data*.

    Note: ``text_standard`` is left at its default output here (a grade-label
    string, not a float).
    """
    return {
        'flesch_reading_ease': textstat.flesch_reading_ease(test_data),
        'smog_index': textstat.smog_index(test_data),
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade(test_data),
        'coleman_liau_index': textstat.coleman_liau_index(test_data),
        'automated_readability_index':
            textstat.automated_readability_index(test_data),
        'dale_chall_readability_score':
            textstat.dale_chall_readability_score(test_data),
        'difficult_words': textstat.difficult_words(test_data),
        'linsear_write_formula': textstat.linsear_write_formula(test_data),
        'gunning_fog': textstat.gunning_fog(test_data),
        'text_standard': textstat.text_standard(test_data),
    }
def _extract_readability_scores(self, text: Text, scores=None) -> Dict:
    """Extract the requested readability scores for *text*.

    Parameters
    ----------
    text : Text
        The text to analyse.
    scores : optional container of metric names
        When ``None`` (default) every supported metric is computed; otherwise
        only the metrics named in *scores* are computed.

    Returns
    -------
    Dict
        Metric name -> score, in the same order as the original
        per-metric ``if`` chain.
    """
    # Metric name -> extractor callable.  Insertion order matches the
    # original if-chain so the returned dict is ordered identically.
    extractors = {
        'flesch_reading_ease': textstat.flesch_reading_ease,
        'smog_index': textstat.smog_index,
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade,
        'coleman_liau_index': textstat.coleman_liau_index,
        'automated_readability_index': textstat.automated_readability_index,
        'dale_chall_readability_score': textstat.dale_chall_readability_score,
        'difficult_words': textstat.difficult_words,
        'linsear_write_formula': textstat.linsear_write_formula,
        'gunning_fog': textstat.gunning_fog,
        # text_standard needs the float_output flag, so wrap it.
        'text_standard': lambda t: textstat.text_standard(t, float_output=True),
    }
    output = {}
    for name, extractor in extractors.items():
        # PEP 8: compare with None using ``is``, not ``==`` (original bug-smell).
        if scores is None or name in scores:
            output[name] = extractor(text)
    return output
def text_analysis(test_data):
    """Print the standard textstat readability metrics for *test_data*,
    one ``name: value`` line per metric."""
    # (label, metric function) pairs, printed in the original order.
    metrics = [
        # higher scores indicate easier material; aim for > 60.0
        ('flesch_reading_ease', textstat.flesch_reading_ease),
        # US grade level
        ('smog_index', textstat.smog_index),
        # US grade level
        ('flesch_kincaid_grade', textstat.flesch_kincaid_grade),
        # US grade level
        ('coleman_liau_index', textstat.coleman_liau_index),
        # US grade level
        ('automated_readability_index', textstat.automated_readability_index),
        # 0.1579 * (difficult words / words * 100) + 0.0496 * (words / sentences)
        ('dale_chall_readability_score', textstat.dale_chall_readability_score),
        # raw count of difficult words
        ('difficult_words', textstat.difficult_words),
        # grade level from sentence length and 3+ syllable words
        ('linsear_write_formula', textstat.linsear_write_formula),
        # understood by someone who left education later than the index
        ('gunning_fog', textstat.gunning_fog),
        # consensus US grade level
        ('text_standard', textstat.text_standard),
    ]
    for label, metric in metrics:
        print(label + ': ' + str(metric(test_data)))
def print_readability(text_to_analyse, option='short'):
    """Print readability diagnostics for *text_to_analyse*.

    Only ``option='all'`` produces any output; every other value (including
    the default ``'short'``) is a no-op, matching the original behaviour.
    """
    if option != 'all':
        return
    print(
        "flesch (0-29: confusing, 30-59: Difficult, 60-69: Standard, 70-100: Easy): ",
        textstat.flesch_reading_ease(text_to_analyse))
    print("smog (years of education required): ",
          textstat.smog_index(text_to_analyse))
    print(
        "flesch kinkaid (70-100: Fairly Easy; 60-70: Plain English; 30-60: Fairly Difficult; 30-0: Very Difficult): ",
        textstat.flesch_kincaid_grade(text_to_analyse))
    print("coleman liau: ", textstat.coleman_liau_index(text_to_analyse))
    print(
        "auto read (1-4: 5-10 years age; 5-8: 10-14 y; 9-12: 14-18 y; 13-14: 18+): ",
        textstat.automated_readability_index(text_to_analyse))
    print("dale chall (< 5: kid; 5-8: scholar; 9-10: college): ",
          textstat.dale_chall_readability_score(text_to_analyse))
    print("difficult words: ", textstat.difficult_words(text_to_analyse))
    print("linsear write: ", textstat.linsear_write_formula(text_to_analyse))
    print("gunning fog (9-12: High-school; 13-17: College): ",
          textstat.gunning_fog(text_to_analyse))
    print("text standard (estimated school grade level): ",
          textstat.text_standard(text_to_analyse))
def readability():
    """Iterate over the job-bulletin directory and compute readability scores.

    NOTE(review): every ``textstat`` call in the loop is fed the file *name*
    (or path string), never the file *contents* -- so the scores describe the
    filename text itself.  Presumably the intent was to open and read each
    file first; confirm before relying on this output.

    Returns a Flask ``jsonify`` response containing a one-element list with
    the flesch_reading_ease value of the last file processed.
    """
    # Fixed sample sentence used for the k/l/m/n metrics below.
    text = "I am some really difficult text to read because I use obnoxiously large words."
    # Initial value is immediately overwritten inside the loop below.
    test_data = ("data/Job Bulletins/ACCOUNTANT 1513 062218.txt")
    all_files = os.listdir("data/Job Bulletins")
    all_files.sort()
    counter = 0
    average = 0
    for file_name in all_files:
        test_data = file_name
        # print(test_data)
        test_data_name = f"data/Job Bulletins/{test_data}"
        # print(test_data_name)
        # Scores the path string, not the file contents (see docstring note).
        a = textstat.flesch_reading_ease(test_data_name)
        # counter += 1
        '''Score Difficulty
        90-100 Very Easy
        80-89 Easy
        70-79 Fairly Easy
        60-69 Standard
        50-59 Fairly Difficult
        30-49 Difficult
        0-29 Very Confusing '''
        # b..j score the bare filename string (see docstring note).
        b = textstat.smog_index(test_data)
        c = textstat.flesch_kincaid_grade(test_data)
        d = textstat.coleman_liau_index(test_data)
        e = textstat.automated_readability_index(test_data)
        f = textstat.dale_chall_readability_score(test_data)
        '''Score Understood by
        4.9 or lower average 4th-grade student or lower
        5.0-5.9 average 5th or 6th-grade student
        6.0-6.9 average 7th or 8th-grade student
        7.0-7.9 average 9th or 10th-grade student
        8.0-8.9 average 11th or 12th-grade student
        9.0-9.9 average 13th to 15th-grade (college) student '''
        g = textstat.difficult_words(test_data)
        h = textstat.linsear_write_formula(test_data)
        i = textstat.gunning_fog(test_data)
        j = textstat.text_standard(test_data)
        # k..n are computed on the fixed sample sentence, not the file.
        k = textstat.syllable_count(text, lang='en_US')
        l = textstat.lexicon_count(text, removepunct=True)
        m = textstat.gunning_fog(text)
        n = textstat.text_standard(text, float_output=False)
        counter += a
        # 683 is presumably the expected number of bulletins -- TODO confirm.
        average = counter / 683
        # print(average)
    # Only the last file's flesch score survives the loop.
    my_list = [a]
    return (jsonify(my_list))
print(len(words)) # remove the stopwords words = [word for word in words if word not in stw] lyrics_no_sw = [ word for word in words if word not in stopwords.words('english') ] # Calculate the total number of words ttl_words = len(words) # Calculate the total number of sentences docReader = nltk.corpus.PlaintextCorpusReader('./', artist + '.txt') sentences = len(docReader.sents()) # Calculate the total number of difficult words diff_words_count = textstat.difficult_words(raw_text) # Calculate readability-- Gunning Fog dif_words = (diff_words_count / ttl_words * 100) gf_read = 0.4 * (float(ttl_words / sentences) + dif_words) # Calculate readability-- SMOG poly_syl = 0 for word in words: syl_count = textstatistics().syllable_count(word) if syl_count >= 3: poly_syl += 1 SMOG = (1.043 * (30 * (poly_syl / sentences))**0.5) + 3.1291 smog_read = legacy_round(SMOG, 1) # Calculate readability-- Linsear Write
# Fragment: part of a per-review feature-extraction loop; ``reviews``, ``i``,
# ``cnt``, and the per-metric accumulator lists are defined before this chunk.
cap_words = word_tokenize(review_captial)
cap_words = [
    w for w in cap_words
    if w not in ['.', ',', ';', '?', ':', '!', '"', "'", '#']
]
# fraction of tokens starting with an uppercase letter
for w in cap_words:
    if w[0].isupper():
        cnt += 1
capital_count.append(cnt / len(cap_words))
# obtaining readability features
reviews[i] = reviews[i].strip().lower().replace("\'", '')
kingrade.append(textstat.flesch_kincaid_grade(reviews[i]))
gunning.append(textstat.gunning_fog(reviews[i]))
flesch_reading_ease1.append(textstat.flesch_reading_ease(reviews[i]))
difficult_words1.append(textstat.difficult_words(reviews[i]))
smog_index1.append(textstat.smog_index(reviews[i]))
automated_readability_index1.append(
    textstat.automated_readability_index(reviews[i]))
coleman_liau_index1.append(textstat.coleman_liau_index(reviews[i]))
linsear_write_formula1.append(
    textstat.linsear_write_formula(reviews[i]))
dale_chall_readability_score1.append(
    textstat.dale_chall_readability_score(reviews[i]))
word_freq = []
# obtaining punctuation count (fraction of tokens that are punctuation)
words = word_tokenize(reviews[i])
punct = [w for w in words if w in ['.', ',', ';', '?', ':', '!']]
punct_count.append(len(punct) / len(words))
def download(request):
    """Django view: stream every tweet in ``tweetsList`` as a CSV attachment.

    For each tweet, the row combines stored metadata, textstat readability
    metrics, and VADER sentiment scores for the original text, the comment
    text (if any), and their concatenation.

    NOTE(review): depends on the module-level ``tweetsList`` global being
    populated by another view before this one is called.
    """
    global tweetsList
    response = HttpResponse(content_type='application/x-download')
    response['Content-Disposition'] = 'attachment; filename="tweets.csv"'
    #set headers of csv
    fieldnames = ['datetime', 'last updated', 'original username',
                  'original screen name', 'original user location',
                  'original user verified', 'retweet', 'retweeter username',
                  'retweeter screen name', 'retweeter location',
                  'retweeter verified', 'text', 'comment',
                  # 'hashtags', 'urls', '#retweets','#favorites', '#retweets of retweet',
                  'hashtags', 'urls', '#retweets', '#favorites',
                  '#favorites of retweet', 'original syllable count',
                  'original lexicon count', 'original sentence count',
                  'original flesch reading ease score',
                  'original flesch-kincaid grade level', 'original fog scale',
                  'original smog index', 'original automated readability index',
                  'original coleman-liau index', 'original linsear write level',
                  'original dale-chall readability score',
                  'original difficult words', 'original readability consensus',
                  'original neg sentiment', 'original neu sentiment',
                  'original pos sentiment', 'original overall sentiment',
                  'comment syllable count', 'comment lexicon count',
                  'comment sentence count', 'comment flesch reading ease score',
                  'comment flesch-kincaid grade level', 'comment fog scale',
                  'comment smog index', 'comment automated readability index',
                  'comment coleman-liau index', 'comment linsear write level',
                  'comment dale-chall readability score',
                  'comment difficult words', 'comment readability consensus',
                  'comment neg sentiment', 'comment neu sentiment',
                  'comment pos sentiment', 'comment overall sentiment',
                  'combined syllable count', 'combined lexicon count',
                  'combined sentence count',
                  'combined flesch reading ease score',
                  'combined flesch-kincaid grade level', 'combined fog scale',
                  'combined smog index', 'combined automated readability index',
                  'combined coleman-liau index', 'combined linsear write level',
                  'combined dale-chall readability score',
                  'combined difficult words', 'combined readability consensus',
                  'combined neg sentiment', 'combined neu sentiment',
                  'combined pos sentiment', 'combined overall sentiment',
                  'twitter users query', 'twitter excluded users query',
                  'twitter hashtags query', 'twitter keywords query',
                  'twitter from date query', 'twitter to date query']
    writer = csv.writer(response, delimiter=',', quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow(fieldnames)
    for tweet in tweetsList:
        #combine hashtags of tweet into string separated by commas
        hashtagString = ""
        tweetHashtags = HashtagLog.objects.filter(tweet__id=tweet.id)
        for i in range(len(tweetHashtags)):
            if i == 0:
                hashtagString += tweetHashtags[i].hashtag.hashtagText
            else:
                hashtagString += ", " + tweetHashtags[i].hashtag.hashtagText
        #combine urls of tweet into string separated by commas
        urlString = ""
        tweetUrls = UrlLog.objects.filter(tweet__id=tweet.id)
        for i in range(len(tweetUrls)):
            if i == 0:
                urlString += tweetUrls[i].url.urlText
            else:
                urlString += ", " + tweetUrls[i].url.urlText
        #display yes or no in verified column for original user
        if tweet.originalUser.isVerified:
            originalVerifiedString = "yes"
        else:
            originalVerifiedString = "no"
        #if not a retweet, new user fields should be empty
        newUsername = None
        newScreenName = None
        newLocation = None
        newVerifiedString = None
        #if retweet:
        #display yes or no in verified column for new user
        if tweet.newUser:
            if tweet.newUser.isVerified:
                newVerifiedString = "yes"
            else:
                newVerifiedString = "no"
            #set retweet fields
            newUsername = tweet.newUser.username
            newScreenName = tweet.newUser.screenName
            newLocation = tweet.newUser.location
        #display yes or no in retweet column
        if tweet.isRetweet:
            isRetweetString = "yes"
        else:
            isRetweetString = "no"
        #get sentiment scores of original text
        sid_obj = SentimentIntensityAnalyzer()
        sentiment_dict_original = sid_obj.polarity_scores(tweet.originalText)
        #combine comment text and original tezt and get sentiment scores for the combination
        commentText = ""
        if tweet.commentText:
            commentText = tweet.commentText
        sentiment_dict_combined = sid_obj.polarity_scores(tweet.originalText +
                                                          commentText)
        #intialize all comment word processing to empty strings in case there is no comment text
        cSyllableCount = ""
        cLexiconCount = ""
        cSentenceCount = ""
        cFleschReadingEase = ""
        cFleschKincaidGrade = ""
        cGunningFog = ""
        cSmogIndex = ""
        cAutomatedReadabilityIndex = ""
        cColemanLiauIndex = ""
        cLinsearWriteFormula = ""
        cDaleChallReadabilityScore = ""
        cDifficultWords = ""
        cTextStandard = ""
        #if there is comment text, get language processing stats for comment text
        if tweet.commentText != None:
            cSyllableCount = textstat.syllable_count(tweet.commentText,
                                                     lang='en_US')
            cLexiconCount = textstat.lexicon_count(tweet.commentText,
                                                   removepunct=True)
            cSentenceCount = textstat.sentence_count(tweet.commentText)
            cFleschReadingEase = textstat.flesch_reading_ease(tweet.commentText)
            cFleschKincaidGrade = textstat.flesch_kincaid_grade(tweet.commentText)
            cGunningFog = textstat.gunning_fog(tweet.commentText)
            cSmogIndex = textstat.smog_index(tweet.commentText)
            cAutomatedReadabilityIndex = textstat.automated_readability_index(tweet.commentText)
            cColemanLiauIndex = textstat.coleman_liau_index(tweet.commentText)
            cLinsearWriteFormula = textstat.linsear_write_formula(tweet.commentText)
            cDaleChallReadabilityScore = textstat.dale_chall_readability_score(tweet.commentText)
            cDifficultWords = textstat.difficult_words(tweet.commentText)
            cTextStandard = textstat.text_standard(tweet.commentText,
                                                   float_output=False)
        #get sentiment scores for comment text
        cNegSent = ""
        cNeuSent = ""
        cPosSent = ""
        cCompoundSent = ""
        if tweet.commentText:
            sentiment_dict_comment = sid_obj.polarity_scores(tweet.commentText)
            cNegSent = sentiment_dict_comment['neg']
            cNeuSent = sentiment_dict_comment['neu']
            cPosSent = sentiment_dict_comment['pos']
            cCompoundSent = sentiment_dict_comment['compound']
        #write all information about the tweet, and its language processing stats to row in csv
        writer.writerow(
            [tweet.createdAt, tweet.lastUpdated, tweet.originalUser.username,
             tweet.originalUser.screenName, tweet.originalUser.location,
             originalVerifiedString, isRetweetString, newUsername,
             newScreenName, newLocation, newVerifiedString,
             tweet.originalText, tweet.commentText, hashtagString, urlString,
             tweet.numRetweetsOriginal,
             # tweet.numFavoritesOriginal, tweet.numRetweetsNew, tweet.numFavoritesNew,
             tweet.numFavoritesOriginal, tweet.numFavoritesNew,
             textstat.syllable_count(tweet.originalText, lang='en_US'),
             textstat.lexicon_count(tweet.originalText, removepunct=True),
             textstat.sentence_count(tweet.originalText),
             textstat.flesch_reading_ease(tweet.originalText),
             textstat.flesch_kincaid_grade(tweet.originalText),
             textstat.gunning_fog(tweet.originalText),
             textstat.smog_index(tweet.originalText),
             textstat.automated_readability_index(tweet.originalText),
             textstat.coleman_liau_index(tweet.originalText),
             textstat.linsear_write_formula(tweet.originalText),
             textstat.dale_chall_readability_score(tweet.originalText),
             textstat.difficult_words(tweet.originalText),
             textstat.text_standard(tweet.originalText, float_output=False),
             sentiment_dict_original['neg'], sentiment_dict_original['neu'],
             sentiment_dict_original['pos'],
             sentiment_dict_original['compound'], cSyllableCount,
             cLexiconCount, cSentenceCount, cFleschReadingEase,
             cFleschKincaidGrade, cGunningFog, cSmogIndex,
             cAutomatedReadabilityIndex, cColemanLiauIndex,
             cLinsearWriteFormula, cDaleChallReadabilityScore,
             cDifficultWords, cTextStandard, cNegSent, cNeuSent, cPosSent,
             cCompoundSent,
             textstat.syllable_count(tweet.originalText + commentText,
                                     lang='en_US'),
             textstat.lexicon_count(tweet.originalText + commentText,
                                    removepunct=True),
             textstat.sentence_count(tweet.originalText + commentText),
             textstat.flesch_reading_ease(tweet.originalText + commentText),
             textstat.flesch_kincaid_grade(tweet.originalText + commentText),
             textstat.gunning_fog(tweet.originalText + commentText),
             textstat.smog_index(tweet.originalText + commentText),
             textstat.automated_readability_index(tweet.originalText +
                                                  commentText),
             textstat.coleman_liau_index(tweet.originalText + commentText),
             textstat.linsear_write_formula(tweet.originalText + commentText),
             textstat.dale_chall_readability_score(tweet.originalText +
                                                   commentText),
             textstat.difficult_words(tweet.originalText + commentText),
             textstat.text_standard(tweet.originalText + commentText,
                                    float_output=False),
             sentiment_dict_combined['neg'], sentiment_dict_combined['neu'],
             sentiment_dict_combined['pos'],
             sentiment_dict_combined['compound'], tweet.twitterQueryUsers,
             tweet.twitterQueryNotUsers, tweet.twitterQueryHashtags,
             tweet.twitterQueryKeywords, tweet.twitterQueryFromDate,
             tweet.twitterQueryToDate]
        )
    return response
def ReadabilityFeatureGenerator(df):
    """ Computes various readability features of news content.
    Input: DataFrame
    Returns list of readability features
    """
    t0 = time()
    print("\n---Generating Readability Features:---")

    def lexical_diversity(text):
        # character count over unique-character count of the raw string
        return len(text) / len(set(text))

    def get_counts(text, word_list):
        # number of tokens of the lowercased text found in word_list
        tokens = nltk.tokenize.word_tokenize(text.lower())
        return sum(1 for token in tokens if token in word_list)

    # One column per textstat metric, assigned in the original order so the
    # resulting DataFrame column order is unchanged.
    readability_metrics = [
        ('flesch_reading_ease', textstat.flesch_reading_ease),
        ('smog_index', textstat.smog_index),
        ('flesch_kincaid_grade', textstat.flesch_kincaid_grade),
        ('coleman_liau_index', textstat.coleman_liau_index),
        ('automated_readability_index', textstat.automated_readability_index),
        ('dale_chall_readability_score', textstat.dale_chall_readability_score),
        ('difficult_words', textstat.difficult_words),
        ('linsear_write_formula', textstat.linsear_write_formula),
        ('gunning_fog', textstat.gunning_fog),
    ]
    for column, metric in readability_metrics:
        df[column] = df['articleBody'].map(metric)

    df['i_me_myself'] = df['articleBody'].apply(get_counts,
                                                args=(['i', 'me', 'myself'], ))
    df['punct'] = df['articleBody'].apply(get_counts,
                                          args=([',', '.', '!', '?'], ))
    df['lexical_diversity'] = df['articleBody'].apply(lexical_diversity)

    feats = [
        'flesch_reading_ease', 'smog_index', 'flesch_kincaid_grade',
        'coleman_liau_index', 'automated_readability_index',
        'dale_chall_readability_score', 'difficult_words',
        'linsear_write_formula', 'gunning_fog', 'i_me_myself', 'punct',
        'lexical_diversity'
    ]
    xReadable = df[feats].values
    print('xReadable.shape: ', xReadable.shape)
    print('---Readability Features is complete---')
    print("Time taken {} seconds\n".format(time() - t0))
    return [xReadable]
import textstat

# reference: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
# reference: https://pypi.org/project/textstat/
# NOTE(review): this snippet uses Python 2 ``print`` statements and will not
# parse under Python 3; it also appears truncated mid-string at its end.

# url = https://www.omio.com/trains/paris/london
text = "Trains in Europe are a convenient way of traveling between cities, with a number of train companies offering domestic and international routes. Trains from Paris to London are incredibly fast and convenient, with high-speed Eurostar trains traveling between the cities in about 2.5 hours. Eurostar trains depart from Paris frequently during the day from early morning until evening, providing plenty of travel options. Paris to London train travel time ranges between 2 hours 15 minutes and 2 hours 30 minutes, depending on the chosen departure time. Eurostar's Paris to London trains leave from Paris Gare du Nord station and travel directly to London St Pancras International. Peak times tend to be in the morning around 8:30 a.m. and 4:30 p.m. in the afternoon."

# Print every standard readability metric for the English sample.
print "Flesch Reading Ease: " + str(textstat.flesch_reading_ease(text))
print "Flesch Kincaid Grade: " + str(textstat.flesch_kincaid_grade(text))
print "Smog Index: " + str(textstat.smog_index(text))
print "Coleman Liau Index: " + str(textstat.coleman_liau_index(text))
print "Automated Readability Index: " + str(
    textstat.automated_readability_index(text))
print "Dale Chall Readability Score: " + str(
    textstat.dale_chall_readability_score(text))
print "Difficult Words: " + str(textstat.difficult_words(text))
print "Linsear Write Formula: " + str(textstat.linsear_write_formula(text))
print "Gunning Fog: " + str(textstat.gunning_fog(text))
print "Text Standard: " + str(textstat.text_standard(text))
print "==========\n"

# url = https://de.omio.com/bahn/berlin/amsterdam-rdudx
# Switch textstat to German rules before scoring the German sample.
textstat.set_lang('de_DE')
text = "Bahn von Berlin nach Amsterdam \
Tagtäglich verbinden mehrere Schnellzüge der Deutschen Bahn Berlin mit Amsterdam auf direktem Wege: Einfach am Hauptbahnhof einsteigen, entspannt zurücklehnen und nach 6,5 Stunden im Herzen der niederländischen Metropole aussteigen. Zusätzlich bestehen zahlreiche genauso schnelle Verbindungen mit einmaligem Umsteigen in Hannover oder Duisburg. Einfacher und komfortabler geht es kaum! \
Welche Bahngesellschaften fahren von von Berlin nach Amsterdam? \
Alle Verbindungen mit der Bahn von Berlin nach Amsterdam werden von der Deutschen Bahn angeboten. Fast täglich fahren 7 ICs der DB direkt vom Berliner Hauptbahnhof nach Amsterdam Centraal. Der erste IC verlässt Berlin bereits in den frühen Morgenstunden und erreicht mittags Amsterdam. Der letzte IC fährt am späten Nachmittag in Berlin ab und kommt noch vor Mitternacht in Amsterdam an. \
Daneben besteht die Möglichkeit, in Hannover oder Duisburg in ICs oder ICEs nach Amsterdam umzusteigen. Wer die längere Dauer und das mehrmalige Umsteigen nicht scheut, kann auch die landschaftliche attraktive Route über Ostfriesland wählen: Dabei führt die Fahrt mit der Bahn von Berlin nach Amsterdam über Hamburg, Bremen, Leer/Ostfriesland, Groningen und Almere. \
\
Wie lange dauert die Bahnfahrt von Berlin nach Amsterdam? \
Die direkte Fahrt mit der Bahn von Berlin nach Amsterdam dauert exakt 6,5 Stunden. Doch auch die Umsteigeverbindungen über Hannover oder Duisburg benötigen nicht länger für die gesamte Strecke, da die Bahn bis Hannover bzw. Duisburg weniger Zwischenhalte einlegt. \
"SUNDAY, JANUARY 13, 2019. Additional instructions will be sent via e-mail. " "Candidates who fail to complete the advisory essay as instructed may be disqualified." "The multiple-choice test will be proctored and administered on-line during a single session. " "Candidates invited to participate in the on-line multiple-choice test will be able to take the test " "as instructed from a remote location using a computer with a webcam and a reliable internet connection. " "Candidates will receive an e-mail from the City of Los Angeles outlining the dates and " "specific steps on how to take the multiple-choice test and advisory essay on-line" ) textstat.flesch_reading_ease(test_data) textstat.smog_index(test_data) textstat.flesch_kincaid_grade(test_data) textstat.coleman_liau_index(test_data) textstat.automated_readability_index(test_data) textstat.dale_chall_readability_score(test_data) textstat.difficult_words(test_data) textstat.linsear_write_formula(test_data) textstat.gunning_fog(test_data) textstat.text_standard(test_data) # In[ ]: # Let's take another sample df_opening_pdfs.head() # Clean up the pdf opening names df_openings = pd.DataFrame()
import textstat #import readtime test_data = "Hello world. Welcome to my home!" print("Flesch Reading Ease : " + str(textstat.flesch_reading_ease(test_data))) print("Smog Index : " + str(textstat.smog_index(test_data))) print("Flesch Kincaid Grade : " + str(textstat.flesch_kincaid_grade(test_data))) print("Coleman Liau Index : " + str(textstat.coleman_liau_index(test_data))) print("Automated Readibility Index : " + str(textstat.automated_readability_index(test_data))) print("Dale Chall Readability Score : " + str(textstat.dale_chall_readability_score(test_data))) print("Difficult Words : " + str(textstat.difficult_words(test_data))) print("Linsear Write Formula : " + str(textstat.linsear_write_formula(test_data))) print("Gunning Fog : " + str(textstat.gunning_fog(test_data))) print("Text Stamdard : " + str(textstat.text_standard(test_data))) res = len(test_data.split()) print("Word Count : " + str(res)) """ Average reading speed of an adult - roughly 256 words per minute 256 words can be read in 60 seconds 1 word can be read in (60/256) seconds n words can be read in (60/256) * n seconds
def get_difficult_words(text):
    """Return textstat's difficult-word count.

    *text* is expected to expose its raw string via a ``.text`` attribute
    (e.g. a document/span wrapper object).
    """
    raw_string = text.text
    return textstat.difficult_words(raw_string)
def test_difficult_words():
    """The long fixture text should contain exactly 49 difficult words
    under the en_US language rules."""
    textstat.set_lang("en_US")
    assert textstat.difficult_words(long_test) == 49
def test_difficult_words():
    """The long fixture text should contain exactly 49 difficult words
    (whatever language textstat is currently configured for)."""
    assert textstat.difficult_words(long_test) == 49
def process(self, df):
    """Add readability/meta feature columns to *df* (derived from the
    ``articleBody`` column), pickle the resulting feature matrix to
    ``../saved_data/kaggle/read.pkl``, and return 1."""
    t0 = time()
    print("\n---Generating Readability Features:---\n")

    def lexical_diversity(text):
        # character count over unique-character count of the raw string
        return len(text) / len(set(text))

    def get_counts(text, word_list):
        # number of tokens of the lowercased text found in word_list
        tokens = nltk.tokenize.word_tokenize(text.lower())
        return sum(1 for token in tokens if token in word_list)

    # One column per textstat metric, assigned in the original order so the
    # DataFrame column order and progress output are unchanged.
    readability_metrics = [
        ('flesch_reading_ease', textstat.flesch_reading_ease),
        ('smog_index', textstat.smog_index),
        ('flesch_kincaid_grade', textstat.flesch_kincaid_grade),
        ('coleman_liau_index', textstat.coleman_liau_index),
        ('automated_readability_index', textstat.automated_readability_index),
        ('dale_chall_readability_score', textstat.dale_chall_readability_score),
        ('difficult_words', textstat.difficult_words),
        ('linsear_write_formula', textstat.linsear_write_formula),
        ('gunning_fog', textstat.gunning_fog),
    ]
    for column, metric in readability_metrics:
        df[column] = df['articleBody'].astype(str).map(metric)
        print(column + ' done!')

    df['i_me_myself'] = df['articleBody'].astype(str).apply(
        get_counts, args=(['i', 'me', 'myself'], ))
    print('i_me_myself done!')
    df['punct'] = df['articleBody'].astype(str).apply(
        get_counts, args=([',', '.', '!', '?'], ))
    print('punct done!')
    df['lexical_diversity'] = df['articleBody'].astype(str).apply(
        lexical_diversity)
    print('lexical_diversity done!')

    feats = [
        'flesch_reading_ease', 'smog_index', 'flesch_kincaid_grade',
        'coleman_liau_index', 'automated_readability_index',
        'dale_chall_readability_score', 'difficult_words',
        'linsear_write_formula', 'gunning_fog', 'i_me_myself', 'punct',
        'lexical_diversity'
    ]
    feature_matrix = df[feats].values
    # Persist both the feature names and the matrix (highest pickle protocol).
    with open('../saved_data/kaggle/read.pkl', 'wb') as outfile:
        pickle.dump(feats, outfile, -1)
        pickle.dump(feature_matrix, outfile, -1)
    print('readable features saved in read.pkl')
    print('\n---Readability Features is complete---')
    print("Time taken {} seconds\n".format(time() - t0))
    return 1
def get_article_features(title, text, nlp):
    '''Build a feature dictionary for an article.

    Parameters:
        title: the article headline (raw string).
        text: the article body (raw string).
        nlp: a loaded spaCy language model, used for the displaCy renders.

    Returns:
        dict with difficult-word/word counts, lexical diversity,
        word-frequency distributions (with and without stopwords), VADER
        polarity percentages of the title, a parsed reading standard, and
        two HTML visualisations (dependency parse of the title, NER of the
        body).
    '''

    def preprocess(sentence):
        # Lowercase and keep only \w+ tokens (drops punctuation), rejoined
        # with single spaces.
        sentence = sentence.lower()
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(sentence)
        return " ".join(tokens)

    def lexical_diversity(text):
        '''
        Returns the diversity of the string (unique tokens as a
        percentage of all tokens).
        '''
        tokens = word_tokenize(text)
        word_count = len(tokens)
        vocab_size = len(set(tokens))
        diversity_score = vocab_size / word_count
        return (diversity_score * 100)

    def freq_dist_sentence(text, stop_flag=False):
        '''
        Returns word count for Each Sentence (an nltk FreqDist; with
        stop_flag, stopwords and non-alphabetic tokens are excluded).
        '''
        text = preprocess(text)
        tokenized_word = word_tokenize(text)
        # with Stop Flag enabled
        if stop_flag:
            stop_words = set(stopwords.words("english"))
            tokenized_word = [
                x for x in tokenized_word
                if (x not in stop_words and x.isalpha())
            ]
        fdist = FreqDist(tokenized_word)
        return fdist

    def polarity_sc(text):
        # Returns dictionary of Polarity Score. Vader Intensity Analyzer
        sid = SentimentIntensityAnalyzer()
        scores = sid.polarity_scores(text)
        return scores

    def reading_standard(text):
        # Parses textstat's text_standard() label (e.g. "8th and 9th grade");
        # presumably group(1)/group(3) capture the two grade numbers --
        # TODO confirm the regex against actual textstat output.
        x = textstat.text_standard(text)
        match = re.search(r'(.?\d+)th(\s\w{3}\s((.?\d+)))?', x)
        r_stan = []
        if match:
            r_stan.append(match.group(1))
            r_stan.append(match.group(3))
        return r_stan

    def spacy_vizualizer(title, text, nlp):
        '''
        Returns Graphs of NER and Dependency Parse. Return Format is HTML
        (dependency parse of the title, entity markup of the body; also
        prints the dependency HTML as a side effect).
        '''
        text = nlp(text)
        title = nlp(title)
        html_dep = displacy.render(title, style='dep', page=True)
        html_ent = displacy.render(text, style='ent', minify=True)
        dep = html_dep
        ent = html_ent
        print(dep)
        return (dep, ent)

    result = {}
    #Result = []
    Text = preprocess(text)
    Title = preprocess(title)
    result['difficult_words'] = textstat.difficult_words(Text)
    result['word_count'] = len(word_tokenize(Text))
    result['lexical_diversity'] = lexical_diversity(Text)
    result['word_dist'] = dict(freq_dist_sentence(Text).most_common())
    result['word_dist_without_stopwords'] = dict(
        freq_dist_sentence(Text, stop_flag=True).most_common())
    result['polarity_title_pos'] = polarity_sc(Title)['pos'] * 100
    result['polarity_title_neg'] = polarity_sc(Title)['neg'] * 100
    result['polarity_title_neu'] = polarity_sc(Title)['neu'] * 100
    # Note: uses the raw ``text`` here, not the preprocessed ``Text``.
    result['reading_standard'] = reading_standard(text)
    result['dependency_html'], result['ner_html'] = spacy_vizualizer(
        title, text, nlp)
    return result
# Fragment: tail of a ``transform_functions = [`` list opened before this
# chunk; each lambda maps a text string to one meta feature.
lambda x: x.count("?") / len(x.split()),
lambda x: x.count("-") / len(x.split()),
lambda x: x.count(",") / len(x.split()),
lambda x: x.count("$") / len(x.split()),
lambda x: x.count("(") / len(x.split()),
# average word length
lambda x: len(x) / (x.count(" ") + 1),
# words per sentence (approximated by spaces per period)
lambda x: x.count(" ") / (x.count(".") + 1),
lambda x: len(re.findall("\d", x)),
lambda x: len(re.findall("[A-Z]", x)),
lambda x: textstat.flesch_reading_ease(x),
lambda x: textstat.smog_index(x),
lambda x: textstat.flesch_kincaid_grade(x),
lambda x: textstat.coleman_liau_index(x),
lambda x: textstat.automated_readability_index(x),
lambda x: textstat.dale_chall_readability_score(x),
lambda x: textstat.difficult_words(x),
lambda x: textstat.linsear_write_formula(x),
lambda x: textstat.gunning_fog(x),
]

# Apply each function and put the results into a list.
columns = []
for func in transform_functions:
    columns.append(df["text"].apply(func))

# Convert the meta features to a numpy array.
meta = np.asarray(columns).T

##features = np.hstack([ meta,chi_matrix.todense()])
features = np.hstack([meta, tfidf.todense()])
##features=tfidf
def __init__(self):
    """Print every textstat readability metric for a fixed sample passage,
    then the polarity of each of its sentences via TextBlob."""
    test_data = (
        '''Playing games has always been thought to be important to the development of well-balanced and creative children; however, what part, if any, they should play in the lives of adults has never been researched that deeply. I believe that playing games is every bit as important for adults as for children. Not only is taking time out to play games with our children and other adults valuable to building interpersonal relationships but is also a wonderful way to release built up tension. ''')

    # (metric name, scoring function) pairs, in display order.
    metrics = (
        ('flesch_reading_ease', textstat.flesch_reading_ease),
        ('flesch_kincaid_grade', textstat.flesch_kincaid_grade),
        ('difficult_words', textstat.difficult_words),
        ('automated_readability_index', textstat.automated_readability_index),
        ('text_standard', textstat.text_standard),
        ('smog_index', textstat.smog_index),
        ('gunning_fog', textstat.gunning_fog),
        ('coleman_liau_index', textstat.coleman_liau_index),
        ('dale_chall_readability_score', textstat.dale_chall_readability_score),
        ('linsear_write_formula', textstat.linsear_write_formula),
        ('fernandez_huerta', textstat.fernandez_huerta),
        ('szigriszt_pazos', textstat.szigriszt_pazos),
        ('gutierrez_polini', textstat.gutierrez_polini),
        ('crawford', textstat.crawford),
    )
    for label, score in metrics:
        print(label, score(test_data))

    print('=========')

    # Per-sentence sentiment polarity of the same passage.
    blob = TextBlob(test_data)
    for sentence in blob.sentences:
        print(sentence, sentence.sentiment.polarity)
def getneurograde():
    """Score the essay currently in the text box with textstat readability
    metrics and render each score (plus word count and an estimated reading
    time) as a label row on the canvas.

    Reads the module-level ``textbox``, ``root`` and ``canvas1`` tkinter
    widgets; creates new Label widgets as a side effect; returns None.
    """
    composition = textbox.get("1.0", END)

    def _place_row(text, y_pos):
        """Render one 'Name : value' row on the canvas at vertical offset y_pos."""
        row = tk.Label(root, text=text, font=('helvetica', 10), bg="#a4de02")
        canvas1.create_window(400, y_pos, window=row)

    # (display name, score as string) in top-to-bottom display order.
    # NOTE: fixed two defects from the original: the Flesch-Kincaid row
    # printed the reading-ease value instead of the grade, and the on-screen
    # label said "Readibility" while the console print said "Readability".
    metric_rows = [
        ('Flesch Reading Ease', str(textstat.flesch_reading_ease(composition))),
        ('Flesch Kincaid Grade', str(textstat.flesch_kincaid_grade(composition))),
        ('Smog Index', str(textstat.smog_index(composition))),
        ('Coleman Liau Index', str(textstat.coleman_liau_index(composition))),
        ('Automated Readability Index',
         str(textstat.automated_readability_index(composition))),
        ('Dale Chall Readability Score',
         str(textstat.dale_chall_readability_score(composition))),
        ('Difficult Words', str(textstat.difficult_words(composition))),
        ('Linsear Write Formula', str(textstat.linsear_write_formula(composition))),
        ('Gunning Fog', str(textstat.gunning_fog(composition))),
        ('Text Standard', str(textstat.text_standard(composition))),
    ]

    y = 500  # first metric row; each subsequent row sits 20px lower
    for name, value in metric_rows:
        line = name + ' : ' + value
        print(line)
        _place_row(line, y)
        y += 20

    # Word count and estimated reading time are displayed but not printed,
    # matching the original behavior.
    word_count = len(composition.split())
    _place_row('Word Count : ' + str(word_count), 700)

    # 60 / 256 seconds per word, i.e. an assumed ~256 words-per-minute pace.
    reading_time = (60 / 256) * word_count
    _place_row('Average Reading Time : ' + str(reading_time) + ' seconds', 720)
def test_difficult_words(self):
    """long_test must contain exactly 49 textstat 'difficult' words."""
    expected = 49
    actual = textstat.difficult_words(self.long_test)
    self.assertEqual(expected, actual)
def testing_tfidf_fe(s):
    """Classify one article string with the pickled classifier.

    Builds the same feature vector used at training time — hand-crafted
    meta features (punctuation ratios, digit/uppercase counts, textstat
    readability scores) stacked with a GloVe-vocabulary tf-idf matrix —
    then predicts with the model in ``my_dumped_classifier.pkl``.

    Reads ``glove.6B.100d.txt`` and ``my_dumped_classifier.pkl`` from the
    working directory. Prints a verdict and returns the predicted label
    (0 = legitimate, anything else = fake).
    """
    s = clean_text(s)
    df = pd.DataFrame(columns=['text'])
    df.loc[0] = [s]

    # Vocabulary = GloVe tokens minus English stopwords.
    # Use a set for O(1) membership tests and a context manager so the
    # file is closed even if a line fails to parse.
    stop_words = set(stopwords.words("english"))
    glove = []
    with open("glove.6B.100d.txt", "r", encoding="utf8") as f:
        for line in f:
            token = line.split()[0]
            if token not in stop_words:
                glove.append(token)

    # Hand-crafted meta features; order must match training.
    # Regex patterns are raw strings ("\d" in a normal string is a
    # deprecated escape on modern Python).
    transform_functions = [
        lambda x: x.count(" ") / len(x.split()),
        lambda x: x.count(".") / len(x.split()),
        lambda x: x.count("!") / len(x.split()),
        lambda x: x.count("?") / len(x.split()),
        lambda x: x.count("-") / len(x.split()),
        lambda x: x.count(",") / len(x.split()),
        lambda x: x.count("$") / len(x.split()),
        lambda x: x.count("(") / len(x.split()),
        lambda x: len(x) / (x.count(" ") + 1),          # avg word length
        lambda x: x.count(" ") / (x.count(".") + 1),    # words per sentence
        lambda x: len(re.findall(r"\d", x)),
        lambda x: len(re.findall(r"[A-Z]", x)),
        lambda x: textstat.flesch_reading_ease(x),
        lambda x: textstat.smog_index(x),
        lambda x: textstat.flesch_kincaid_grade(x),
        lambda x: textstat.coleman_liau_index(x),
        lambda x: textstat.automated_readability_index(x),
        lambda x: textstat.dale_chall_readability_score(x),
        lambda x: textstat.difficult_words(x),
        lambda x: textstat.linsear_write_formula(x),
        lambda x: textstat.gunning_fog(x),
    ]

    transformer = TfidfTransformer(smooth_idf=True)
    count_vectorizer = CountVectorizer(ngram_range=(2, 3), vocabulary=glove)
    counts = count_vectorizer.fit_transform(df['text'].values)
    tfidf = transformer.fit_transform(counts)

    columns = [df["text"].apply(func) for func in transform_functions]
    meta = np.asarray(columns).T
    features = np.hstack([meta, tfidf.todense()])

    # Load the classifier exactly once (the original loaded the pickle a
    # second time at the top of the function and discarded the result).
    with open('my_dumped_classifier.pkl', 'rb') as fid:
        gnb_loaded = cPickle.load(fid)

    ans = gnb_loaded.predict(features)[0]
    if ans == 0:
        print("Article is legitimate according to classifier")
    else:
        print("Article is fake according to classifiers")
    return ans