def syllables_count(word): """ Textstat is a python package, to calculate statistics from text to determine readability complexity and grade level of a particular corpus. Package can be found at https://pypi.python.org/pypi/textstat """ return textstatistics().syllable_count(word)
def calcFeatures(params):
    index, rev = params  # Multiprocessing...
    global rev_xl
    filename = "insert data path of the 2015 data from https://figshare.com/articles/English_Wikipedia_Quality_Asssessment_Dataset/1375406" + str(rev['revid'])
    if os.path.exists(filename):
        print(rev['revid'])
        text = util.read_file(filename)
        text = util.cleanhtml(text)
        text = text.replace('\'\'\'', '')
        assert rev['pageid'] == rev_xl.iloc[index, 0]
        print("matched ", rev['revid'])
        calc = readcalc.ReadCalc(text)
        textual_score = list(calc.get_all_metrics())
        text_stat = textstatistics()
        linsear_write_formula = round(text_stat.linsear_write_formula(text), 2)
        textual_score.append(linsear_write_formula)
        grammar_score = len(tool.check(text))
        textual_score.append(grammar_score)
        rev_xl.iloc[index, 14:36] = textual_score
        print(rev_xl.iloc[index, :])
        if index % 10 == 0:
            rev_xl.to_csv(path)
def syllables(text):
    """
    :param text: string representation of a list of words
    :return: total number of syllables across the words
    """
    num_of_syllables = 0
    stripped = text.replace("[", "").replace("]", "").replace("'", "")
    splits = stripped.split(',')
    for word in splits:
        num_of_syllables += textstatistics().syllable_count(word)
    return num_of_syllables
def flesch_reading_ease(text):
    # Reading Ease score = 206.835 - (1.015 * average sentence length) - (84.6 * average syllables per word)
    words_count, sentences_count, _, _ = get_param(text)
    # Calculate average sentence length
    avg_sentence_length = float(words_count / sentences_count)
    syllable_count = textstatistics().syllable_count(text)
    # Calculate average syllables per word
    avg_syllables_per_word = float(syllable_count) / float(words_count)
    FRE = 206.835 - float(1.015 * avg_sentence_length) - float(84.6 * avg_syllables_per_word)
    return legacy_round(FRE, 2)
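# Worked example (illustrative numbers, not from the original source): for a
# text with 100 words, 5 sentences and 130 syllables, the formula above gives
#
#     FRE = 206.835 - 1.015 * (100 / 5) - 84.6 * (130 / 100)
#         = 206.835 - 20.3 - 109.98
#         = 76.56 (after legacy_round to two decimals), i.e. fairly easy prose.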
def complex_words(text):
    words = []
    sentences = break_sentences(text)
    for sentence in sentences:
        words += [token for token in sentence]
    diff_words_set = set()
    for word in words:
        if word not in easy_word_set and textstatistics().syllable_count(str(word)) >= 2:
            diff_words_set.add(word)
    return len(diff_words_set)
def difficulty(text):
    """
    :param text: string representation of a list of words
    :return: list of difficult words (not in the easy-word set and more than two syllables)
    """
    difficulties = []
    stripped = text.replace("[", "").replace("]", "").replace("'", "")
    splits = stripped.split(',')
    for word in splits:
        if word not in easy_word_set and textstatistics().syllable_count(word) > 2:
            difficulties.append(word)
    return difficulties
def poly_syllable_count(text):
    """
    :param text: string representation of a list of words
    :return: number of words with three or more syllables
    """
    count = 0
    stripped = text.replace("[", "").replace("]", "").replace("'", "")
    splits = stripped.split(',')
    for word in splits:
        syllable_count = textstatistics().syllable_count(word)
        if syllable_count >= 3:
            count += 1
    return count
def c_score(self, text):
    nlp = spacy.load('en')
    doc = nlp(text)
    sentences = [sent for sent in doc.sents]
    words = 0
    for sentence in sentences:
        words += len([token for token in sentence])
    num_sent = len(sentences)
    sent_len = float(words / num_sent)
    sylls = textstatistics().syllable_count(text)
    ASPW = float(sylls) / float(words)
    syls_p_wd = legacy_round(ASPW, 1)
    FRE = 206.835 - float(1.015 * sent_len) - float(84.6 * syls_p_wd)
    score = legacy_round(FRE, 2)
    return words, score
def text_analysis(text):
    # Use spacy lib for tokenization
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    sentences = doc.sents
    # Create 3 counters
    wordsNo = 0
    sentencesNo = 0
    chars = 0
    words = []
    for sentence in sentences:
        # Count all sentences
        sentencesNo += 1
        for token in sentence:
            if token.dep_ != 'punct':
                # Count all words without punctuation
                wordsNo += 1
                # Create a list of words
                words.append(str(token))
    if sentencesNo > 0:
        # Calculate average sentence length
        average_sentence_length = float(wordsNo / sentencesNo)
    else:
        average_sentence_length = 0
    # Get the number of syllables in the text
    syllable_count = textstatistics().syllable_count(text)
    # Calculate the number of characters
    for word in words:
        chars += len(word)
    if wordsNo > 0:
        # Calculate average word length
        average_word_length = float(chars / wordsNo)
    else:
        average_word_length = 0
    return wordsNo, sentencesNo, average_sentence_length, syllable_count, average_word_length
def get_param(text):
    # Use spacy lib for tokenization
    nlp = spacy.load('en')
    doc = nlp(text)
    sentences = doc.sents
    # Create 3 counters
    wordsNo = 0
    sentencesNo = 0
    poly_syllable_count = 0
    # Create an empty list for words
    words = []
    for sentence in sentences:
        # Count all words
        wordsNo += len([token for token in sentence])
        # Count all sentences
        sentencesNo += 1
        # Create a list of words
        words += [str(token) for token in sentence]
    # Create a set to contain difficult words
    diff_words = set()
    # Load the easy word set
    # Note: on Python 3, resource_stream yields bytes, so ln may need .decode('utf-8') here
    easy_word = set([
        ln.strip() for ln in pkg_resources.resource_stream(
            'textstat', '/resources/en/easy_words.txt')
    ])
    # Loop over all words
    for word in words:
        # Get the syllable count
        syllable_count = textstatistics().syllable_count(word)
        # A word is poly-syllabic when it has three or more syllables
        if syllable_count >= 3:
            poly_syllable_count += 1
        # A word is difficult when it is not in the easy word set and has two or more syllables
        if word not in easy_word and syllable_count >= 2:
            diff_words.add(word)
    # Analyse the text and return the parameters needed by the readability formulas
    return wordsNo, sentencesNo, len(diff_words), poly_syllable_count
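# Minimal sketch (an assumption, not in the original source): the counts returned
# by get_param are enough to evaluate common readability formulas directly.
# The helper names below are hypothetical.
def gunning_fog_from_params(text):
    # Difficult-words variant of Gunning Fog: 0.4 * (avg sentence length + % difficult words).
    # Some definitions use poly-syllabic words instead of difficult words.
    words, sentences, diff_words, _ = get_param(text)
    return round(0.4 * ((words / sentences) + 100 * (diff_words / words)), 2)

def smog_from_params(text):
    # SMOG grade = 1.043 * sqrt(poly-syllabic words * 30 / sentences) + 3.1291
    _, sentences, _, poly_syllables = get_param(text)
    return round(1.043 * (30 * (poly_syllables / sentences)) ** 0.5 + 3.1291, 1)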
def flesch_kincaid_grade(word):
    return textstatistics().flesch_kincaid_grade(word)
def linsear_write_score(text):
    return textstatistics().linsear_write_formula(text)
def flesch_grade_score(text):
    return textstatistics().flesch_kincaid_grade(text)
def __init__(self): """ An local Object of the textstat package is created before using analyse function """ super().__init__() self.text_statistics = TS.textstatistics()
def __init__(self): """ Creating textstat object for analyse function """ self.text_statistics = TS.textstatistics()
def num_syllables(word):
    return textstatistics().syllable_count(word)
def __init__(self):
    super().__init__()
    self.ts = TS.textstatistics()
def dale_chall_readability_score(word):
    return textstatistics().dale_chall_readability_score(word)
def lexicon_count(word):
    return textstatistics().lexicon_count(word)
def flesch_ease_score(text):
    return textstatistics().flesch_reading_ease(text)
def coleman_liau_index(word):
    return textstatistics().coleman_liau_index(word)
def syllables_count(word):
    return textstatistics().syllable_count(word, lang='en_US')
# Calculate the total number of sentences
docReader = nltk.corpus.PlaintextCorpusReader('./', artist + '.txt')
sentences = len(docReader.sents())
# Calculate the total number of difficult words
diff_words_count = textstat.difficult_words(raw_text)
# Calculate readability -- Gunning Fog
dif_words = (diff_words_count / ttl_words * 100)
gf_read = 0.4 * (float(ttl_words / sentences) + dif_words)
# Calculate readability -- SMOG
poly_syl = 0
for word in words:
    syl_count = textstatistics().syllable_count(word)
    if syl_count >= 3:
        poly_syl += 1
SMOG = (1.043 * (30 * (poly_syl / sentences)) ** 0.5) + 3.1291
smog_read = legacy_round(SMOG, 1)
# Calculate readability -- Coleman-Liau
cl_read = textstat.coleman_liau_index(raw_text)
df.loc[i] = (artist, 0, ttl_words, sentences, 0, len(set(words)),
             round(100 - (len(lyrics_no_sw) * 100.0 / ttl_words), 2),
             diff_words_count, gf_read, smog_read, cl_read)
i += 1
df['songs'] = [304, 224]
df['words_per_song'] = df['words'] / df['songs']
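# Quick arithmetic check of the SMOG expression above (illustrative numbers,
# not from the original data): with 150 poly-syllabic words in 100 sentences,
#
#     SMOG = 1.043 * sqrt(30 * 150 / 100) + 3.1291
#          = 1.043 * sqrt(45) + 3.1291
#          = 6.997 + 3.129 = 10.1 after rounding to one decimal.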
def linsear_write_formula(word):
    return textstatistics().linsear_write_formula(word)
def text_param(text):
    # Use spacy lib for tokenization
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    sentences = doc.sents
    # Create 5 counters
    wordsNo = 0
    sentencesNo = 0
    poly_syllable_count = 0
    long_word = 0
    chars = 0
    # Create an empty list for words
    words = []
    for sentence in sentences:
        # Count all sentences
        sentencesNo += 1
        for token in sentence:
            # Count all words
            wordsNo += 1
            words.append(str(token))
            # print(str(token))
    # Create a set to contain difficult words (not used below)
    diff_words = set()
    # Load the easy word set
    easy_word = set([ln.strip() for ln in pkg_resources.resource_stream('textstat', '/resources/en/easy_words.txt')])
    for word in words:
        # Get the syllable count of the word
        syllable_count = textstatistics().syllable_count(word)
        # A word is poly-syllabic when it has three or more syllables
        if syllable_count >= 3:
            poly_syllable_count += 1
        # A long word is one whose length is greater than 7 characters
        if len(word) > 7:
            long_word += 1
        # Count the number of characters
        chars += len(word)
    # Get the syllable count of the whole text
    syllable_count = textstatistics().syllable_count(text)
    # Get the lexicon count of the whole text
    lexical_counts = textstat.lexicon_count(text, removepunct=True)
    # Calculate average sentence length
    average_sentence_length = float(wordsNo / sentencesNo)
    # Calculate average syllables per word
    average_syllables_per_words = float(syllable_count / wordsNo)
    # Calculate average poly-syllabic words per word
    average_poly_syllable = float(poly_syllable_count / wordsNo)
    # Calculate average long words per word
    average_long_word = float(long_word / wordsNo)
    # Calculate average word length
    average_word_length = float(chars / wordsNo)
    # Return a list with the text parameters used in the readability equations as features
    return [wordsNo, sentencesNo, average_sentence_length, syllable_count,
            average_syllables_per_words, poly_syllable_count, lexical_counts,
            average_poly_syllable, long_word, average_long_word, average_word_length]
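# Usage sketch (illustrative; the variable names are assumptions): the list
# returned by text_param can be unpacked into named readability features,
# provided spacy's en_core_web_sm model and textstat are installed.
sample = "The quick brown fox jumps over the lazy dog. It barked."
(words_no, sentences_no, avg_sentence_len, syllables, avg_syllables_per_word,
 poly_syllables, lexicon, avg_poly_syllables, long_words,
 avg_long_words, avg_word_len) = text_param(sample)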
def syllables_count(dummy):
    return textstatistics().syllable_count(dummy)
def load():
    TextStats.ts = textstatistics()
    TextStats.es = easy_word_set
def smog_index(word):
    return textstatistics().smog_index(word)
def syllables_count(word):
    return textstatistics().syllable_count(word)
def automated_readability_index(word):
    return textstatistics().automated_readability_index(word)