def most_verbose(data_word):
    """Finds long multi-syllable words and outputs a dataframe with values."""
    verbose_words = []
    synonyms = []
    # looping through words to find complex words and their synonyms
    for word in data_word:
        # finding complex words and recording word & lemma
        if textstat.syllable_count(word) > 3:
            word_syn = wordnet.synsets(word)
            lemmas = list(
                chain.from_iterable([syn.lemma_names() for syn in word_syn]))
            lemmas = [
                lemma for lemma in lemmas if textstat.syllable_count(lemma) <= 3
            ]
            verbose_words.append(word)
            synonyms.append(lemmas)
    # creating dataframe with data
    df_verbose = pd.DataFrame({
        'Word': verbose_words,
        'Synonyms': synonyms
    }, columns=['Word', 'Synonyms'])
    df_verbose.sort_values('Word', inplace=True)
    df_verbose.drop_duplicates(subset='Word', keep='first', inplace=True)
    return df_verbose
def count_fry_readability(data_list):
    sentence_numbers = 0
    syllables_numbers = 0
    # count sentences and syllables over the first ~150 words
    words_count = 0
    for sentence in data_list:
        sentence_numbers = sentence_numbers + 1
        syllables_numbers = syllables_numbers + textstat.syllable_count(sentence)
        words_count = words_count + textstat.lexicon_count(sentence)
        if words_count >= 150:
            break
    # count sentences and syllables over the last ~150 words
    words_count = 0
    for sentence in reversed(data_list):
        sentence_numbers = sentence_numbers + 1
        syllables_numbers = syllables_numbers + textstat.syllable_count(sentence)
        words_count = words_count + textstat.lexicon_count(sentence)
        if words_count >= 150:
            break
    avg_sentence_numbers = round(sentence_numbers / 3)
    avg_syllables_numbers = round(syllables_numbers / 3)
    return get_value_from_fry_graph(avg_sentence_numbers, avg_syllables_numbers)
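A minimal usage sketch (the input file name is hypothetical; it assumes NLTK for sentence splitting and the get_value_from_fry_graph helper referenced above, defined elsewhere in the same module):

import nltk
import textstat

# hypothetical input document, split into the sentence list the function expects
raw_text = open("sample_article.txt").read()
sentences = nltk.sent_tokenize(raw_text)
fry_grade = count_fry_readability(sentences)
print(fry_grade)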
def stat(data, data_word, data_sent):
    """Computes basic overview metrics and returns a list of values."""
    # basic counts
    sent = len(data_sent)
    syll = textstat.syllable_count(data)
    word = len(data_word)
    # averages
    avg_syll = syll / word
    avg_word = word / sent
    read_time = word / 265  # estimated reading time in minutes at 265 words per minute
    # advanced stats
    flesch_kincaid_grade = fkg(int(word), int(sent), int(syll))
    verbose = len(
        [word for word in data_word if textstat.syllable_count(word) > 3])
    wordy = 0
    for item in data_sent:
        token = word_tokenize(item)
        if len(token) > 40:
            wordy += 1
    # writing to list
    stats = [
        syll, word, sent, avg_syll, avg_word, read_time, flesch_kincaid_grade,
        verbose, wordy
    ]
    return stats
def syllable_table():
    lst_1 = [
        textstat.syllable_count(text) for text in insincere_questions.tolist()
    ]
    lst_2 = [
        textstat.syllable_count(text) for text in sincere_questions.tolist()
    ]
    table = build_table(lst_1, lst_2)
    py.plot(table, filename='syllable_table')
def build_syllable_dct(lst):
    dct = {}
    for t in lst:
        if textstat.syllable_count(t) in dct:
            dct[textstat.syllable_count(t)] = dct[textstat.syllable_count(t)] + 1
        else:
            dct[textstat.syllable_count(t)] = 1
    for key in dct.keys():
        dct[key] = dct[key] / len(lst)
    sorted_tuple = sorted(dct.items(), key=operator.itemgetter(0))
    return sorted_tuple
def other_features_(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features.

    This is modified to only include those features in the final model."""
    sentiment = sentiment_analyzer.polarity_scores(tweet)
    words = preprocess(tweet)  # Get text only
    syllables = textstat.syllable_count(words)  # count syllables in words
    num_chars = sum(len(w) for w in words)  # num chars in words
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables + 0.001)) / float(num_words + 0.001), 4)
    num_unique_terms = len(set(words.split()))

    ### Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words) / 1.0) + float(11.8 * avg_syl) - 15.59, 1)
    ### Modified FRE score, where sentence count is fixed to 1
    FRE = round(206.835 - 1.015 * (float(num_words) / 1.0) - (84.6 * float(avg_syl)), 2)

    twitter_objs = count_twitter_objs(tweet)  # Count #, @, and http://
    features = [FKRA, FRE, syllables, num_chars, num_chars_total, num_terms,
                num_words, num_unique_terms, sentiment['compound'],
                twitter_objs[2], twitter_objs[1]]
    # features = pandas.DataFrame(features)
    return features
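For reference, a sketch of the unmodified formulas these per-tweet variants are based on, written with word, sentence, and syllable counts as plain inputs (the helper names are illustrative, not part of the original module):

def standard_fk_grade(words, sentences, syllables):
    # Flesch-Kincaid grade level
    return 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59

def standard_flesch_reading_ease(words, sentences, syllables):
    # Flesch Reading Ease (higher scores mean easier text)
    return 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)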
def textstat_stats(text):
    doc_length = len(text.split())
    flesch_ease = ts.flesch_reading_ease(text)  # Flesch Reading Ease Score
    flesch_grade = ts.flesch_kincaid_grade(text)  # Flesch-Kincaid Grade Level
    gfog = ts.gunning_fog(text)  # FOG index, also indicates grade level
    # smog = ts.smog_index(text)  # SMOG index, also indicates grade level, only useful on 30+ sentences
    auto_readability = ts.automated_readability_index(text)  # approximates the grade level needed to comprehend the text
    cl_index = ts.coleman_liau_index(text)  # grade level of the text using the Coleman-Liau Formula
    lw_formula = ts.linsear_write_formula(text)  # grade level using the Linsear Write Formula
    dcr_score = ts.dale_chall_readability_score(text)  # uses a lookup table of the most commonly used 3000 English words
    # text_standard = ts.text_standard(text, float_output=False)  # summary of all the grade level functions
    syll_count = ts.syllable_count(text, lang='en_US')
    syll_count_scaled = syll_count / doc_length
    lex_count = ts.lexicon_count(text, removepunct=True)
    lex_count_scaled = lex_count / doc_length
    idx = ['flesch_ease', 'flesch_grade', 'gfog', 'auto_readability',
           'cl_index', 'lw_formula', 'dcr_score',
           # 'text_standard',
           'syll_count', 'lex_count']
    return pd.Series([flesch_ease, flesch_grade, gfog, auto_readability,
                      cl_index, lw_formula, dcr_score,
                      # text_standard,
                      syll_count_scaled, lex_count_scaled], index=idx)
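A minimal usage sketch showing how such a Series-returning function is typically applied row-wise to a pandas column (the DataFrame and its 'text' column are invented; ts is textstat, and the installed textstat version is assumed to accept the lang argument used above):

import pandas as pd
import textstat as ts

df = pd.DataFrame({'text': [
    "Short example sentence.",
    "A considerably more elaborate illustrative sentence follows here.",
]})
# apply() on a Series-returning function produces one column per index label
features = df['text'].apply(textstat_stats)
df = pd.concat([df, features], axis=1)
print(df.columns.tolist())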
def feature_getter(text):
    try:
        text = text.decode('utf-8')
    except:
        pass
    text1 = re.sub(r'[^\x00-\x7F]+', ' ', text)
    ##text1 = re.sub('\n', '. ', text)
    text = text1
    features = []
    tokens = []
    sentences = nltk.sent_tokenize(text)
    [tokens.extend(nltk.word_tokenize(sentence)) for sentence in sentences]
    syllable_count = textstat.syllable_count(text, lang='en_US')
    word_count = textstat.lexicon_count(text, removepunct=True)
    flesch = textstat.flesch_reading_ease(text)
    readability = textstat.automated_readability_index(text)
    features.append(len(sentences))   # num_sentences
    features.append(syllable_count)   # num_syllables
    features.append(word_count)       # num_words
    features.append(flesch)           # flesch_reading_ease
    features.append(readability)      # automated_readability_index
    return features
def fkg_over_text(data_sent):
    """Returns two lists of x and y points for an fkg graph"""
    if len(data_sent) >= 200:
        step = 40
    else:
        step = int(len(data_sent) / 10)
    y = []
    temp_fkg = []
    for count, sent in enumerate(data_sent, 1):
        temp_fkg.append(sent)
        if count >= step:
            words = [
                word for sent in temp_fkg for word in nltk.word_tokenize(sent)
            ]
            words = [word.lower() for word in words if word.isalpha()]
            word = len(words)
            syll = sum([textstat.syllable_count(word) for word in words])
            y.append(fkg(word, step, syll))
            temp_fkg = temp_fkg[1:]
    x = range(step, len(y) + step)
    return x, y
def most_wordy(data_sent):
    """Finds long sentences and outputs a dataframe with values."""
    # initialize lists
    sylls = []
    words = []
    sents = []
    fkgs = []
    # looping through sentences to find lengthy sentences
    for sent in data_sent:
        token = word_tokenize(sent)
        word = len(token)
        if word > 40:
            # appending to lists
            syll = textstat.syllable_count(sent)
            sylls.append(syll)
            words.append(word)
            sents.append(sent)
            fkgs.append(fkg(int(word), 1, int(syll)))
    # transfer information to dataframe
    df_wordy = pd.DataFrame(
        {
            'Words': words,
            'Syllables': sylls,
            'Flesch Kincaid Grade Level': fkgs,
            'Sentence': sents
        },
        columns=[
            "Words", "Syllables", "Flesch Kincaid Grade Level", "Sentence"
        ])
    df_wordy.sort_values("Words", ascending=False, inplace=True)
    return df_wordy
def flesch_kincaid(row):
    text = row['reviewText']
    words = max(1, textstat.lexicon_count(text))
    sentences = max(1, sentence_count(row))
    syllables = textstat.syllable_count(text, lang='en_US')
    # Standard Flesch Reading Ease formula (higher scores mean easier text)
    score = 206.835 - 1.015 * (float(words) / sentences) - 84.6 * (float(syllables) / words)
    return score
def syllableCount(word):
    # Check the Moby project first because robots are bad at english.
    # If it's not in the dictionary, ask Zoltar.
    if (masterSyllables.get(word)):
        return masterSyllables.get(word)
    elif (masterSyllables.get(word.lower())):
        return masterSyllables.get(word.lower())
    else:
        return textstat.syllable_count(word)
def get_raw_stats(book, text):
    return {
        'total_words': textstat.lexicon_count(text),
        'total_sentences': len(sent_tokenize(text)),
        'total_letters': textstat.letter_count(text),
        'total_syllables': textstat.syllable_count(text),
        # 'total_paragraphs': len(get_paragraphs(text)),
        # 'average_word_difficulty': get_average_frequency(book)
    }
def do_datas():
    # logging.info('do_datas')

    ########### Save text statistics
    ##### 1. nw  2. nvocab  3. nsyllable  4. nsentence  5. tone  6. readability

    ## 1. nw
    nw.append(len(words))
    ## 2. nvocab
    nvocab.append(len(vocab))
    ## 3. nsyllable
    n = textstat.syllable_count(contents)
    nsyllable.append(n)
    ## 4. nsentence
    n = textstat.sentence_count(contents)
    nsentence.append(n)
    ## 5. tone
    ### LM dictionary
    n_neg_lm.append(count_occurrence(words, lm_neg))
    n_pos_lm.append(count_occurrence(words, lm_pos))
    n_uctt_lm.append(count_occurrence(words, lm_uctt))
    n_lit_lm.append(count_occurrence(words, lm_lit))
    n_cstr_lm.append(count_occurrence(words, lm_cstr))
    n_modal1_lm.append(count_occurrence(words, lm_modal1))
    n_modal2_lm.append(count_occurrence(words, lm_modal2))
    n_modal3_lm.append(count_occurrence(words, lm_modal3))
    n_negation_lm.append(count_negation(words, lm_pos, gt_negation))
    ### General Inquirer dictionary
    n_neg_gi.append(count_occurrence(words, gi_neg))
    n_pos_gi.append(count_occurrence(words, gi_pos))
    n_negation_gi.append(count_negation(words, gi_pos, gt_negation))
    ### Henry dictionary
    n_neg_hr.append(count_occurrence(words, hr_neg))
    n_pos_hr.append(count_occurrence(words, hr_pos))
    n_negation_hr.append(count_negation(words, gi_pos, gt_negation))
    ## 6. readability
    fre_i = textstat.flesch_reading_ease(contents)
    if fre_i > 100:
        fre_i = 100
    if fre_i < 0:
        fre_i = float('NaN')
    fre.append(fre_i)
    fkg_i = textstat.flesch_kincaid_grade(contents)
    if fkg_i < 0:
        fkg_i = float('NaN')
    fkg.append(fkg_i)
    # RIX
    cl_i = textstat.coleman_liau_index(contents)
    if cl_i < 0:
        cl_i = float('NaN')
    cl.append(cl_i)
    f = textstat.gunning_fog(contents)
    fog.append(f)
    f = textstat.automated_readability_index(contents)
    ari.append(f)
    f = textstat.smog_index(contents)
    smog.append(f)
def compute_readability_stats(text):
    """
    Compute reading statistics of the given text
    Reference: https://github.com/shivam5992/textstat

    Parameters
    ==========
    text: str, input section or abstract text
    """
    try:
        readability_dict = {
            'flesch_reading_ease': textstat.flesch_reading_ease(text),
            'smog': textstat.smog_index(text),
            'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
            'coleman_liau_index': textstat.coleman_liau_index(text),
            'automated_readability_index': textstat.automated_readability_index(text),
            'dale_chall': textstat.dale_chall_readability_score(text),
            'difficult_words': textstat.difficult_words(text),
            'linsear_write': textstat.linsear_write_formula(text),
            'gunning_fog': textstat.gunning_fog(text),
            'text_standard': textstat.text_standard(text),
            'n_syllable': textstat.syllable_count(text),
            'avg_letter_per_word': textstat.avg_letter_per_word(text),
            'avg_sentence_length': textstat.avg_sentence_length(text)
        }
    except:
        readability_dict = {
            'flesch_reading_ease': None,
            'smog': None,
            'flesch_kincaid_grade': None,
            'coleman_liau_index': None,
            'automated_readability_index': None,
            'dale_chall': None,
            'difficult_words': None,
            'linsear_write': None,
            'gunning_fog': None,
            'text_standard': None,
            'n_syllable': None,
            'avg_letter_per_word': None,
            'avg_sentence_length': None
        }
    return readability_dict
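A minimal usage sketch (the abstract text is invented for illustration):

sample_abstract = (
    "We measure the readability of scientific abstracts. "
    "Several standard formulas are combined into one feature dictionary."
)
stats = compute_readability_stats(sample_abstract)
print(stats['flesch_kincaid_grade'], stats['n_syllable'])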
def _add_lemma(self, lm):
    tlid = self.lemma_name.setdefault(lm, self.lid)
    # sid = max(self.lid, tlid+1)
    if tlid == self.lid:
        self.lemma.append((tlid, lm, textstat.syllable_count(lm)))  # , False)) - isCommon
        self.lid += 1
        # if lemma not already visited, visit all of its synsets
        for ss in wn.synsets(lm):
            tssid = self._add_synset(ss)
            self.means.add((tlid, tssid))
    return tlid
def sentence_fit(gen_text, orig_text):
    # Text generated from GPT2 stored in dataframe
    df = pd.DataFrame(gen_text, columns=['generated'])
    # Remove spaces in front of punctuation
    df['generated'] = df['generated'].str.replace(r' +,', ',').str.replace(r' +\.', '.')
    # Assess cosine similarity between sentences
    df['similarity'] = df['generated'].apply(lambda x: text_similarity(orig_text, x))
    # Count number of syllables and words
    df['n_syll'] = df['generated'].apply(textstat.syllable_count)
    df['n_lex'] = df['generated'].apply(textstat.lexicon_count)
    df['syll_lex'] = df['n_syll'] / df['n_lex']  # Syllable to word ratio

    # Flags to indicate whether generated text has fewer syllables, fewer words,
    # or a lower syllable-to-word ratio than the original
    df['rel_syll'] = np.where(df['n_syll'] < textstat.syllable_count(orig_text), 1, 0)
    df['rel_lex'] = np.where(df['n_lex'] < textstat.lexicon_count(orig_text), 1, 0)
    df['rel_rat'] = np.where(
        df['syll_lex'] < textstat.syllable_count(orig_text) / textstat.lexicon_count(orig_text), 1, 0)

    # Combine the binary indicators into a relative simplicity score (mean of the three flags)
    df['rel_simp'] = (df['rel_syll'] + df['rel_lex'] + df['rel_rat']) / 3

    # Fit score is a weighted sum of similarity and relative sentence simplicity;
    # the highest score will be chosen
    df['fit_score'] = 0.7 * df['similarity'] + 0.3 * df['rel_simp']

    # Subset data and rename columns
    df['Original'] = orig_text
    df = df[['Original', 'generated', 'similarity', 'rel_simp', 'fit_score']]
    df.columns = ['Original', 'Generated', 'Similarity', 'Simplicity', 'Fit Score']
    return df
def getSyllableCount(word):
    """
    A function to count syllables
    :param word: word whose syllables will be counted
    :return: number of syllables in the word
    """
    words = word.split(" ")
    count = 0
    for eachWord in words:
        count += textstat.syllable_count(eachWord)
    return count
def count_sentences_syllables(data_list):
    sentences_30_syllables = []
    sentences_20_syllables = []
    sentences_30_count = 0
    sentences_20_count = 0
    for sentence in data_list:
        count = sum([textstat.syllable_count(word) for word in sentence.split()])
        if count > 30:
            sentences_30_count += 1
            sentences_30_syllables.append((sentences_30_count, sentence))
        if count > 20 and count < 30:
            sentences_20_count += 1
            sentences_20_syllables.append((sentences_20_count, sentence))
    return sentences_30_syllables, sentences_30_count, sentences_20_syllables, sentences_20_count
def isValuableComment(clean_text):
    if clean_text != clean_text:  # NaN check: NaN is never equal to itself
        return False
    print("clean text:", clean_text)
    if (len(clean_text) <= 10):
        return False
    readability_score = textstat.flesch_reading_ease(clean_text)
    syllable_count = textstat.syllable_count(clean_text)
    # Score of 80-90 == easy (scale runs from 100, very easy, down to hard)
    # if (readability_score > 85):
    #     return False
    # print(readability_score)
    return True
def lisibilty(text):
    f_lis = ([
        textstat.syllable_count(str(text), lang='en_arabic'),
        textstat.lexicon_count(str(text), removepunct=True),
        textstat.sentence_count(str(text)),
        textstat.flesch_reading_ease(str(text)),
        textstat.flesch_kincaid_grade(str(text)),
        textstat.gunning_fog(str(text)),
        textstat.smog_index(str(text)),
        textstat.automated_readability_index(str(text)),
        textstat.coleman_liau_index(str(text)),
        textstat.linsear_write_formula(str(text)),
        textstat.dale_chall_readability_score(str(text))
    ])
    return f_lis
def get_word_stats(word):
    global sentiment_analyzer
    if sentiment_analyzer is None:
        sentiment_analyzer = SentimentIntensityAnalyzer()
    count_syllables = textstat.syllable_count(word)
    freq_score = zipf_frequency(word, "en")
    polarity = sentiment_analyzer.polarity_scores(word)
    stats = {
        "syllables": count_syllables,
        "freq_score": freq_score,
        "sentiment": 1 if polarity["pos"] else -1 if polarity["neg"] else 0,
        "sentiment_degree": polarity["compound"],
        "difficulty": (min(count_syllables, 6) * 5 // (1 + min(freq_score, 6)))
    }
    return stats
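A minimal usage sketch; the imports are assumptions, since the snippet above does not show them (wordfreq for zipf_frequency, NLTK's VADER analyzer, and textstat):

import textstat
from wordfreq import zipf_frequency
from nltk.sentiment.vader import SentimentIntensityAnalyzer  # needs nltk.download('vader_lexicon') once

sentiment_analyzer = None  # lazily created inside get_word_stats

# A common word vs. a rare polysyllabic one: the rare word gets a higher "difficulty"
print(get_word_stats("happy"))
print(get_word_stats("perspicacious"))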
def syll_over_text(data_word):
    """Returns two lists of x and y points for a syllable per word graph"""
    step = 200
    y = []
    temp_syll = []
    for count, word in enumerate(data_word, 1):
        temp_syll.append(textstat.syllable_count(word))
        if count >= step:
            y.append(sum(temp_syll) / len(temp_syll))
            temp_syll = temp_syll[1:]
    x = range(step, len(y) + step)
    return x, y
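A sketch of how the returned x/y series might be plotted; matplotlib and the input file name are assumptions, not part of the original snippet:

import matplotlib.pyplot as plt
import nltk

# hypothetical input: tokenize a document into the flat word list the function expects
raw_text = open("sample_article.txt").read()
data_word = [w.lower() for w in nltk.word_tokenize(raw_text) if w.isalpha()]

x, y = syll_over_text(data_word)
plt.plot(x, y)
plt.xlabel("Word position")
plt.ylabel("Average syllables per word (rolling 200-word window)")
plt.show()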
def words_sentence_syllables(data_list):
    words_12_letters = []
    words_4_syllables = []
    words_12_count = 0
    words_4_count = 0
    for sentence in data_list:
        for word in sentence.split():
            count4 = textstat.syllable_count(word)
            count12 = textstat.letter_count(word)
            if count12 > 12:
                words_12_count += 1
                words_12_letters.append((count12, word))
            if count4 > 4:
                words_4_count += 1
                words_4_syllables.append((count4, word))
    return words_12_letters, words_12_count, words_4_syllables, words_4_count
def forcast(doc):
    """
    :param: doc object
    :returns: tuple with grade level, reading age, monosyllable count
    """
    # drop short non-alphabetic tokens (punctuation etc.) before sampling;
    # filtering into a new list avoids removing items from a list while iterating over it
    word_tokens = [t for t in doc.word_tokens if t.isalpha() or len(t) >= 2]
    monosyllables = 0
    for i in word_tokens[10:159]:
        if syllable_count(i) < 2:
            monosyllables += 1
    gl = 20 - (monosyllables / 10)
    ra = 25 - (monosyllables / 10)
    return (gl, ra, monosyllables)
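The FORCAST formula implemented here is grade ≈ 20 - N/10 and reading age ≈ 25 - N/10, where N is the number of single-syllable words in the roughly 150-word sample. For example, if 120 of the sampled words are monosyllabic, the function returns a grade level of 20 - 12 = 8 and a reading age of 25 - 12 = 13.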
def get_readability_features(self):
    sent_tokens = text_tokenizer(self.raw_text,
                                 replace_url_flag=True,
                                 tokenize_sent_flag=True)
    sentences = [' '.join(sent) + '\n' for sent in sent_tokens]
    sentences = ''.join(sentences)
    self.syllable_count = textstat.syllable_count(sentences)
    self.flesch_reading_ease = textstat.flesch_reading_ease(sentences)
    self.flesch_kincaid_grade = textstat.flesch_kincaid_grade(sentences)
    self.fog_scale = textstat.gunning_fog(sentences)
    self.smog = textstat.smog_index(sentences)
    self.automated_readability = textstat.automated_readability_index(sentences)
    self.coleman_liau = textstat.coleman_liau_index(sentences)
    self.linsear_write = textstat.linsear_write_formula(sentences)
    self.dale_chall_readability = textstat.dale_chall_readability_score(sentences)
    self.text_standard = textstat.text_standard(sentences)
def get_desc_data(string):
    '''
    Input: book description string
    Output: semantic score, word count, description length, number of unique words,
    average word length, syllable count, lexicon count, sentence count, and
    Flesch reading ease
    '''
    # Data before text processing
    desc_semantic = get_semantic(string)
    syl_count = syllable_count(string)
    lex_count = lexicon_count(string)
    sent_count = sentence_count(string)
    flesch = flesch_reading_ease(string)
    # Data after text processing
    string = text_preprocess(string)
    word_cnt = word_count(string)
    description_len = desc_len(string)
    number_unique_words = num_unique_words(string)
    average_word_len = avg_word_len(string)
    return desc_semantic, word_cnt, description_len, number_unique_words, \
        average_word_len, syl_count, lex_count, sent_count, flesch
def test_syllable_count(self):
    count = textstat.syllable_count(self.long_test)
    self.assertEqual(521, count)
def test_syllable_count():
    count = textstat.syllable_count(long_test)
    assert count == 521
PDM Bugs Workflow

If a ticket Summary, Description, Steps to Reproduce or Expected Result are not clear enough for you, use the Feedback status and assign it back to the previous assignee. Be sure to leave a comment describing what you require to proceed.

If you can't proceed with the resolution of the ticket because of any other limitations, set it to Blocked status and give your reasons for doing so in the Comments. Also, remember to change Assignee to whoever you think is responsible for unblocking the issue or the Project Manager if you don't know who that might be. If you decide to do so, you will be able to track the time tickets spend in Feedback and Blocked statuses here. You can set a custom timespan, desired ticket status, and project to see how you're progressing.

Company Glossary - a concept, TBD with Docs Team

Ever had the impression during a conversation that both of you are talking about the same thing but you name it differently? That's common in growing businesses like our company. For example, definition of a device varies, or what's a view to someone is a section to someone else. Or my favourite: some people even refer to bugs as features. The list is very long. Many times the differences stem from the Project customization and it's not possible to have alerts when the client requested faults. But some of them can be unified. Enter the Docs Team! Our Technical Writers are working on a glossary of business, technical and other terms used in our company so that we make sure we're all on the same page in terms of vocabulary.
"""

print(textstat.syllable_count(text))
print(f"The Flesch Reading Ease score is: {textstat.flesch_reading_ease(text)}")
print(f"The Flesch-Kincaid Grade level is: {textstat.flesch_kincaid_grade(text)}")
print(f"The Dale-Chall Readability Score is: {textstat.dale_chall_readability_score(text)}")
print(f"The readability consensus is: {textstat.text_standard(text, float_output=False)}")
def test_syllable_count():
    textstat.set_lang("en_US")
    count = textstat.syllable_count(long_test)
    assert count == 521