Example #1
 def do_text_stats(self, text):
     ### Syllable Count
     syllable_count = textstat.syllable_count(text)
     ### Lexicon Count
     lexicon_count = textstat.lexicon_count(text, True)
     ### Sentence Count
     sentence_count = textstat.sentence_count(text)
     ### The Flesch Reading Ease formula
     try:
         flesch_reading_ease = textstat.flesch_reading_ease(text)
     except TypeError as e:
         flesch_reading_ease = None
     #* 90-100 : Very Easy
     #* 80-89 : Easy
     #* 70-79 : Fairly Easy
     #* 60-69 : Standard
     #* 50-59 : Fairly Difficult
     #* 30-49 : Difficult
     #* 0-29 : Very Confusing
     ### The Flesch-Kincaid Grade Level
     try:
         flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
     except TypeError as e:
         flesch_kincaid_grade = None
     ## The Fog Scale (Gunning FOG Formula)
     gunning_fog = textstat.gunning_fog(text)
     ### The SMOG Index
     smog_index = textstat.smog_index(text)
     ### Automated Readability Index
     automated_readability_index = textstat.automated_readability_index(
         text)
     ### The Coleman-Liau Index
     try:
         coleman_liau_index = textstat.coleman_liau_index(text)
     except TypeError as e:
         coleman_liau_index = None
     ### Linsear Write Formula
     linsear_write_formula = textstat.linsear_write_formula(text)
     ### Dale-Chall Readability Score
     dale_chall_readability_score = textstat.dale_chall_readability_score(
         text)
     ### Readability Consensus based upon all the above tests
     try:
         text_standard = textstat.text_standard(text)
     except TypeError as e:
         text_standard = None
     return {
         "syllable_count": syllable_count,
         "lexicon_count": lexicon_count,
         "sentence_count": sentence_count,
         "flesch_reading_ease": flesch_reading_ease,
         "flesch_kincaid_grade": flesch_kincaid_grade,
         "gunning_fog": gunning_fog,
         "smog_index": smog_index,
         "automated_readability_index": automated_readability_index,
         "coleman_liau_index": coleman_liau_index,
         "linsear_write_formula": linsear_write_formula,
         "dale_chall_readability_score": dale_chall_readability_score,
         "text_standard": text_standard
     }
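The score bands listed in the comments above map directly onto labels. As a minimal sketch (a hypothetical helper, not part of the original snippet), the mapping could be expressed as:

def flesch_reading_ease_label(score):
    # Bands follow the comments above: 90-100 Very Easy ... 0-29 Very Confusing.
    if score is None:
        return None
    if score >= 90:
        return "Very Easy"
    if score >= 80:
        return "Easy"
    if score >= 70:
        return "Fairly Easy"
    if score >= 60:
        return "Standard"
    if score >= 50:
        return "Fairly Difficult"
    if score >= 30:
        return "Difficult"
    return "Very Confusing"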
Example #2
def readability(text):
    print("Readability\n=================================\n\n")
    print("Flesch Reading Ease\n________________________\n\n")
    print(str(textstat.flesch_reading_ease(text)) + "\n")
    print("Smog Index\n________________________\n\n")
    print(str(textstat.smog_index(text)) + "\n")
    print("Flesch Kincaid Grade\n________________________\n\n")
    print(str(textstat.flesch_kincaid_grade(text)) + "\n")
    print("Coleman Liau Index\n________________________\n\n")
    print(str(textstat.coleman_liau_index(text)) + "\n")
    print("ARI\n________________________\n\n")
    print(str(textstat.automated_readability_index(text)) + "\n")
    print("Dale Chall\n________________________\n\n")
    print(str(textstat.dale_chall_readability_score(text)) + "\n")
    print("Difficult Words\n________________________\n\n")
    print(str(textstat.difficult_words(text)) + "\n")
    print("Linsear Write Formula\n________________________\n\n")
    print(str(textstat.linsear_write_formula(text)) + "\n")
    print("Gunning Fog\n________________________\n\n")
    print(str(textstat.gunning_fog(text)) + "\n")
    print("Compiled Score\n_____________________________\n\n")
    print(str(textstat.text_standard(text)) + "\n")
Example #3
def get_readability(df2):
    df = df2.copy()
    text_feats = df.select_dtypes(include=['object']).columns.values
    for i, col in enumerate(text_feats):
        df['flesch_reading_ease{}'.format(i)] = df[col].apply(
            lambda x: textstat.flesch_reading_ease(x))
        df['smog_index{}'.format(i)] = df[col].apply(
            lambda x: textstat.smog_index(x))
        df['flesch_kincaid_grade{}'.format(i)] = df[col].apply(
            lambda x: textstat.flesch_kincaid_grade(x))
        df['coleman_liau_index{}'.format(i)] = df[col].apply(
            lambda x: textstat.coleman_liau_index(x))
        df['automated_readability_index{}'.format(i)] = df[col].apply(
            lambda x: textstat.automated_readability_index(x))
        df['dale_chall_readability_score{}'.format(i)] = df[col].apply(
            lambda x: textstat.dale_chall_readability_score(x))
        df['difficult_words{}'.format(i)] = df[col].apply(
            lambda x: textstat.difficult_words(x))
        df['linsear_write_formula{}'.format(i)] = df[col].apply(
            lambda x: textstat.linsear_write_formula(x))
        df['gunning_fog{}'.format(i)] = df[col].apply(
            lambda x: textstat.gunning_fog(x))
        df['text_standard{}'.format(i)] = df[col].apply(
            lambda x: textstat.text_standard(x))
    return df
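A quick usage sketch, assuming pandas and textstat are imported; the column names and strings below are purely illustrative:

import pandas as pd

df_in = pd.DataFrame({
    "title": ["A short note.", "Another line of text."],
    "body": ["This is easy to read.", "Comprehensibility varies considerably."],
})
df_out = get_readability(df_in)
# One set of readability columns is added per object-typed column,
# e.g. flesch_reading_ease0, smog_index0, ..., text_standard1.
print(df_out.columns.tolist())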
Example #4
 def get_description_composite_grade_level(self):
     """Calculates the grade level of the repository's
     description using a variety of measures in textstat.
     """
     if self.description:
         return textstat.text_standard(self.description)
     else:
         return None
Example #5
 def get_readme_composite_grade_level(self):
     """Calculates the grade level of the repository's
     readme using a variety of measures in textstat.
     """
     if self.readme:
         return textstat.text_standard(self.readme)
     else:
         return None
Example #8
File: level.py Project: ebemunk/blog
def main():
    conn = psycopg2.connect(**db.config())
    read = conn.cursor()
    read.execute('select season, episode, string_agg(text, \' \') from scene_text group by season, episode order by season, episode')
    row = read.fetchone()

    while row is not None:
        text = row[2]
        text_standard = textstat.text_standard(text, True)
        print(row[0], row[1], text_standard)
        row = read.fetchone()
Example #9
def complexityAlongtheText(text: str,
                           chunk_length: int = 5) -> Tuple[float, float, str]:
    # Split into sentences and score consecutive chunks of `chunk_length` sentences.
    sentences = sent_tokenize(text)
    cur = 0
    stds = []
    hardest_std = 0.0
    hardest_chunk_index = 0
    while cur < len(sentences):
        sub = sentences[cur:cur + chunk_length]
        sub_text = " ".join(sub)
        std = textstat.text_standard(sub_text, float_output=True)
        if std > hardest_std:
            hardest_std = std
            hardest_chunk_index = cur
        stds.append(std)
        cur += chunk_length
    hard_snippet = sentences[hardest_chunk_index:hardest_chunk_index +
                             chunk_length]
    hs = " ".join(hard_snippet)
    return np.mean(stds), textstat.text_standard(text, float_output=True), hs
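Assuming sent_tokenize (nltk), numpy as np, and textstat are imported as the snippet implies, a call might look like this (the sample text is illustrative):

sample = ("Readability varies across a document. Some sentences are short. "
          "Others, weighed down by subordinate clauses and polysyllabic vocabulary, "
          "demand considerably more effort from the reader.")
mean_grade, overall_grade, hardest = complexityAlongtheText(sample, chunk_length=2)
print(mean_grade, overall_grade)
print(hardest)  # the chunk of sentences with the highest text_standard score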
Example #10
def main():
    conn = psycopg2.connect(**db.config())
    read = conn.cursor()
    read.execute('select * from all_lines_by_char where char_name in (select char_name from total_lines_by_char limit 25)')
    row = read.fetchone()

    while row is not None:
        text = row[1]
        text_standard = textstat.text_standard(text, True)
        print(row[0], text_standard)
        row = read.fetchone()
Example #11
def _is_readable(phrase: str) -> bool:
    """
    Checks if a given phrase is readable
    :param phrase: The string to check
    :return: True if readable, false if not
    """
    textstat.set_lang("en")
    score = textstat.text_standard(phrase, float_output=True)
    if score > 6:
        return True
    else:
        return False
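A brief usage sketch; text_standard(..., float_output=True) returns an estimated school grade level, so this helper treats anything above grade 6 as "readable" (the example phrases are illustrative):

print(_is_readable("The cat sat on the mat."))  # low grade level, likely False
print(_is_readable(
    "Quantum decoherence complicates the interpretation of measurement "
    "outcomes in macroscopic systems."))  # high grade level, likely True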
Example #12
def get_stats(sentence):
	syllables = textstat.syllable_count(sentence)
	words = textstat.lexicon_count(sentence, True)
	sentence_count = textstat.sentence_count(sentence)

	if sentence_count > 0:
		text_standard = textstat.text_standard(sentence)
	else:
		text_standard = EMPTY_TEXT_STANDARD

	text_standard = fix_grammar_errors(text_standard)

	return combine(syllables, words, sentence_count, text_standard)
Example #13
def main():
    conn = psycopg2.connect(**db.config())
    read = conn.cursor()
    read.execute(
        'select season, episode, string_agg(text, \' \') from scene_text group by season, episode order by season, episode'
    )
    row = read.fetchone()

    while row is not None:
        text = row[2]
        text_standard = textstat.text_standard(text, True)
        print(row[0], row[1], text_standard)
        row = read.fetchone()
Example #14
def get_readability(contents):
    readability = []
    readability.append(textstat.flesch_reading_ease(contents))
    readability.append(textstat.smog_index(contents))
    readability.append(textstat.flesch_kincaid_grade(contents))
    readability.append(textstat.automated_readability_index(contents))
    readability.append(textstat.dale_chall_readability_score(contents))
    readability.append(textstat.difficult_words(contents))
    readability.append(textstat.linsear_write_formula(contents))
    readability.append(textstat.gunning_fog(contents))
    readability.append(textstat.coleman_liau_index(contents))
    readability.append(textstat.text_standard(contents))

    return readability
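Because the scores come back positionally, pairing them with names keeps the output legible; the names below are only for illustration and mirror the append order above:

metric_names = [
    "flesch_reading_ease", "smog_index", "flesch_kincaid_grade",
    "automated_readability_index", "dale_chall_readability_score",
    "difficult_words", "linsear_write_formula", "gunning_fog",
    "coleman_liau_index", "text_standard",
]
scores = get_readability("Some sample text to score. It has two sentences.")
print(dict(zip(metric_names, scores)))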
Example #15
def analyse_json(json_text):
    # consider moving this to be a feature of Transcript in the other module

    df_witnesses = pd.DataFrame(columns=['html_file_location', 'witness_name',
                                         'syllable_count','lexicon_count',
                                         'sentence_count',
                                         'syllables_per_word',
                                         'gunning_fog', 'smog_index',
                                         'text_standard'],
                      index=[])

    trscrpt = json.loads(json_text)
    if 'witnesses' in trscrpt:
        witnesses = trscrpt['witnesses']


        for s in trscrpt['all_sections']:
            if 'speaker' in s and 'person' in s['speaker'] and \
                    s['speaker']['person']['speaker_type']=='witness':
                witness =  witnesses[s['speaker']['person']['name']]
                witness.setdefault('all_text', []).append(s['spoken_text'])

        for i, p in enumerate(witnesses):
            if 'all_text' in witnesses[p]:
                witness_text = '\n\n'.join(witnesses[p]['all_text'])
                if len(witness_text) > 0:
                    stats_data = {'html_file_location': trscrpt['html_file_location'],
                                  'witness_name': p,
                                  'syllable_count': textstat.syllable_count(witness_text),
                                  'lexicon_count': textstat.lexicon_count(witness_text),
                                  'sentence_count': textstat.sentence_count(witness_text),
                                  'syllables_per_word': textstat.avg_syllables_per_word(witness_text),
                                  'gunning_fog': textstat.gunning_fog(witness_text),
                                  'smog_index': textstat.smog_index(witness_text),
                                  'text_standard': textstat.text_standard(witness_text)}
                    df_witnesses.loc['witness_%i' % i] = stats_data
                else:
                    df_witnesses.loc['witness_%i' % i, 'html_file_location'] = trscrpt['html_file_location']
                    df_witnesses.loc['witness_%i' % i, 'witness_name'] = p
            else:
                df_witnesses.loc['witness_%i' % i, 'html_file_location'] = trscrpt['html_file_location']
                df_witnesses.loc['witness_%i' % i, 'witness_name'] = p

    return df_witnesses
Example #16
def run_textstat(text):
    #text = """Playing games has always been thought to be important to the development of well-balanced and creative children; however, what part, if any, they should play in the lives of adults has never been researched that deeply. I believe that playing games is every bit as important for adults as for children. Not only is taking time out to play games with our children and other adults valuable to building interpersonal relationships but is also a wonderful way to release built up tension."""

    ts_flesch_reading_ease = textstat.flesch_reading_ease(text)
    ts_smog_index = textstat.smog_index(text)
    ts_flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
    ts_coleman_liau_index = textstat.coleman_liau_index(text)
    ts_automated_readability_index = textstat.automated_readability_index(text)
    ts_dale_chall_readability_score = textstat.dale_chall_readability_score(
        text)
    ts_difficult_words = textstat.difficult_words(text)
    ts_linsear_write_formula = textstat.linsear_write_formula(text)
    ts_gunning_fog = textstat.gunning_fog(text)
    ts_text_standard = textstat.text_standard(text)

    return (ts_flesch_reading_ease, ts_smog_index, ts_flesch_kincaid_grade,
            ts_coleman_liau_index, ts_automated_readability_index,
            ts_dale_chall_readability_score, ts_difficult_words,
            ts_linsear_write_formula, ts_gunning_fog, ts_text_standard)
Example #17
def lambda_handler(event, context):

    text = event['text']

    response = {}
    response['flesch_reading_ease'] = textstat.flesch_reading_ease(text)
    response['smog_index'] = textstat.smog_index(text)
    response['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)
    response['coleman_liau_index'] = textstat.coleman_liau_index(text)
    response[
        'automated_readability_index'] = textstat.automated_readability_index(
            text)
    response[
        'dale_chall_readability_score'] = textstat.dale_chall_readability_score(
            text)
    response['difficult_words'] = textstat.difficult_words(text)
    response['linsear_write_formula'] = textstat.linsear_write_formula(text)
    response['gunning_fog'] = textstat.gunning_fog(text)
    response['text_standard'] = textstat.text_standard(text)

    return respond(None, response)
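A local smoke test might look like the following, assuming respond() and the textstat import are defined elsewhere in the module as the handler implies:

event = {"text": "Testing the readability handler. It contains two short sentences."}
result = lambda_handler(event, None)  # the Lambda context object is unused here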
Example #18
 def stats(self, text):
     test_data = text
     stats = {}
     stats['flesch_reading_ease'] = textstat.flesch_reading_ease(test_data)
     stats['smog'] = textstat.smog_index(test_data)
     stats['flesch kincaid'] = textstat.flesch_kincaid_grade(test_data)
     stats['coleman Liau'] = textstat.coleman_liau_index(test_data)
     stats['automated'] = textstat.automated_readability_index(test_data)
     stats['dale chall'] = textstat.dale_chall_readability_score(test_data)
     stats['difficult'] = textstat.difficult_words(test_data)
     stats['linsear'] = textstat.linsear_write_formula(test_data)
     stats['gunning_fog'] = textstat.gunning_fog(test_data)
     stats['standard'] = textstat.text_standard(test_data)
     stats['charcount'] = textstat.char_count(test_data)
     stats['lexicon count'] = textstat.lexicon_count(test_data)
     stats['syllable count'] = textstat.syllable_count(test_data)
     stats['sentence count'] = textstat.sentence_count(test_data)
     stats['avg sentence length'] = textstat.avg_sentence_length(test_data)
     stats['avg_syllables_per_word'] = textstat.avg_syllables_per_word(
         test_data)
     stats['avg_letter_per_word'] = textstat.avg_letter_per_word(test_data)
     stats['avg_sentence_per_word'] = textstat.avg_sentence_per_word(
         test_data)
     return stats
Example #19
# main script
if __name__ == '__main__':

    print("TextStat Comparison Script")
    print("--------------------------")

    # read in text from the command line
    # This needs to be fixed to handle/escape special characters
    textToCheck = input("Please enter the text you would like to analyse: ")

    # read in text from a file - but what format?

    print("\n\n")
    print("Results")
    print("==============================================")
    print("==============================================\n")

    print("Syllable Count: " + str(textstat.syllable_count(textToCheck)))
    print("Lexicon Count: " + str(textstat.lexicon_count(textToCheck)))  # True is the default and removes punctuation before counting
    print("Sentence Count: " + str(textstat.sentence_count(textToCheck)))
    print("Flesch Reading Ease formula: " + str(textstat.flesch_reading_ease(textToCheck)))
    print("Flesch-Kincaid Grade Level: " + str(textstat.flesch_kincaid_grade(textToCheck)))
    print("Fog Scale (Gunning FOG Formula): " + str(textstat.gunning_fog(textToCheck)))
    print("SMOG Index: " + str(textstat.smog_index(textToCheck)))
    print("Automated Readability Index: " + str(textstat.automated_readability_index(textToCheck)))
    print("Coleman-Liau Index: " + str(textstat.coleman_liau_index(textToCheck)))
    print("Linsear Write Formula: " + str(textstat.linsear_write_formula(textToCheck)))
    print("Dale-Chall Readability Score: " + str(textstat.dale_chall_readability_score(textToCheck)))
    print("--------------------------------------------------------------")
    print("Readability Consensus based upon all the above tests: " + str(textstat.text_standard(textToCheck)))
    print("\n\n")
Example #20
        print(linsear_write_formula)
        print("The Fog Scale (Gunning FOG Formula)")
        # print(textstat.gunning_fog(test_data))
        gunning_fog = textstat.gunning_fog(test_data)
        print(gunning_fog)

        print(
            "---------------------------------Summary----------------------------------"
        )
        print("Readability Consensus based upon all the above tests")
        print(
            "Based upon all the above tests, "
            "returns the estimated school grade level required to understand the text."
        )
        # print(textstat.text_standard(test_data))
        school_grade_level = textstat.text_standard(test_data)
        print(school_grade_level)

        # l = [flesch_kincaid_grade, coleman_liau_index, automated_readability_index, linsear_write_formula, gunning_fog]
        # grade_level_avg = sum(l) / len(l)
        # print(grade_level_avg)
        print(
            "--------------------------------------------------------------------------"
        )

        print(
            "\n------------------------------------------------------------------------------"
        )
        print("Save every Thing on CSV file")

        list_row = [
Example #21
    def __init__(self, path):
        """
        Create document instance for analysis.

        Opens and reads document to string raw_text.
        Textract interprets the document format and
        opens to plain text string (docx, pdf, odt, txt)

        Args:
            path (str): path to the file to open, analyze, and close


        Public attributes:
        -user: (str) optional string to set username.
        -path: (str) relative path to document.
        -abs_path: (str) the absolute path to the document.
        -file_name:  (str) the file name with extension of document (base
        name).
        -mime:  tbd
        -guessed_type:  makes best guess of mimetype of document.
        -file_type:  returns index[0] from guessed_type.
        -raw_text:  (str) plain text extracted from .txt, .odt, .pdf, .docx,
        and .doc.
        -ptext:  (str) raw text after a series of regex expressions to
        eliminate special characters.
        -text_no_feed:  (str) ptext with most new line characters eliminated
        /n/n stays intact.
        -sentence_tokens:  list of all sentences in a comma separated list
        derived by nltk.
        -sentence_count:  (int) count of sentences found in list.
        -passive_sentences:  list of passive sentences identified by the
        passive module.
        -passive_sentence_count:  count of the passive_sentences list.
        -percent_passive:  (float) ratio of passive sentences to all sentences
        in percent form.
        -be_verb_analysis:  (int) total number of occurrences of each "to be" verb
        (am, is, are, was, were, be, being, been).
        -be_verb_count: tbd
        -be_verb_analysis: tbd
        -weak_sentences_all:  (int) sum of be verb analysis.
        -weak_sentences_set:  (set) set of all sentences identified as
        having to be verbs.
        -weak_sentences_count:  (int) count of items in weak_sentences_set.
        -weak_verbs_to_sentences:  (float) proportion of sentences with to
        be to all sentences in percent (this might not be sound).
        -word_tokens:  list of discrete words in the text that breaks
        contractions up (default nltk tokenizer).
        -word_tokens_no_punct:  list of all words in text including
        contractions but otherwise no punctuation.
        -no_punct:  (str) full text string without sentence punctuation.
        -word_tokens_no_punct:  uses white-space tokenizer to create a list
        of all words.
        -readability_flesch_re:  (int) Flesch Reading Ease Score (numeric
        score) made by textstat module.
        -readability_smog_index:  (int) grade level as determined by the
        SMOG algorithm implemented by the textstat module.
        -readability_flesch_kincaid_grade:  (int)  Flesch-Kincaid grade level
        of reader made by textstat module.
        -readability_coleman_liau_index:  (int) grade level of reader as
        determined by the textstat module.
        -readability_ari:  (int) grade level of reader determined by the
        Automated Readability Index algorithm implemented by textstat.
        -readability_linser_write:  grade level as determined by the
        Linsear Write algorithm implemented by textstat.
        -readability_dale_chall:  (int) grade level based on Dale-Chall
        readability as determined by textstat.
        -readability_standard:  composite grade level based on readability
        algorithms.
        -flesch_re_key:  list for interpreting Flesch RE Score.
        -word_count:  word count of document based on the whitespace tokenizer;
        this word count should be used.
        -page_length:  (float) page length in decimal format given 250
        words per page.
        -paper_count:  (int) number of printed pages given 250 words per
        page.
        -parts_of_speech:  words with parts of speech tags.
        -pos_counts:  values in word, tag couple grouped in a list (Counter).
        -pos_total:  (int) sum of pos_counts values
        -pos_freq:  (dict) word, ratio of whole
        -doc_pages:  (float) page length based on 250 words per page
        (warning, this is the second time this attribute is defined).
        -freq_words:  word frequency count not standardized, based on the
        correct word tokenizer (not ratio, just count).
        modal_dist:  count of auxiliary verbs based on word_tokens_no_punct.
        sentence_count (int): Count the sentence tokens
        passive_sentences (list): List of all sentences identified as passive
        passive_sentence_count (int): count of items in passive_sentences
        be_verb_count (int): count "to be" verbs in text
        word_tokens_no_punct (list): words separated, stripped of punctuation,
         made lower case
        flesch_re_key (str): reading ease score to description
        freq_words (list or dict): frequency distribution of all words
        modal_dist (list): frequency distribution of aux verbs
        """
        self.user = ""
        self.path = path
        self.abs_path = os.path.abspath(self.path)
        if os.path.isfile(self.path):
            self.time_stamp = self.timestamp()
            self.file_name = os.path.basename(path)
            self.mime = MimeTypes()
            self.guessed_type = self.mime.guess_type(self.path)
            self.file_type = self.guessed_type[0]
            self.raw_text = textract.process(self.path, encoding="ascii")
            self.ptext = re.sub(u'[\u201c\u201d]', '"', self.raw_text)
            self.ptext = re.sub(u"\u2014", "--", self.ptext)
            self.ptext = re.sub(",", ",", self.ptext)
            self.ptext = re.sub("—", "--", self.ptext)
            self.ptext = re.sub("…", "...", self.ptext)
            self.text_no_feed = self.clean_new_lines(self.ptext)
            self.sentence_tokens = self.sentence_tokenize(self.text_no_feed)
            self.sentence_count = len(self.sentence_tokens)
            self.passive_sentences = passive(self.text_no_feed)
            self.passive_sentence_count = len(self.passive_sentences)
            self.percent_passive = (100 * (float(self.passive_sentence_count) /
                                           float(self.sentence_count)))
            self.percent_passive_round = round(self.percent_passive, 2)

            self.be_verb_analysis = self.count_be_verbs(self.sentence_tokens)
            self.be_verb_count = self.be_verb_analysis[0]
            self.weak_sentences_all = self.be_verb_analysis[1]
            self.weak_sentences_set = set(self.weak_sentences_all)
            self.weak_sentences_count = len(self.weak_sentences_set)
            self.weak_verbs_to_sentences = 100 * float(
                self.weak_sentences_count) / float(self.sentence_count)
            self.weak_verbs_to_sentences_round = round(
                self.weak_verbs_to_sentences, 2)
            self.word_tokens = self.word_tokenize(self.text_no_feed)
            self.word_tokens_no_punct = \
                self.word_tokenize_no_punct(self.text_no_feed)
            self.no_punct = self.strip_punctuation(self.text_no_feed)
            # use this! It makes everything lower case and strips symbols
            self.word_tokens_no_punct = self.ws_tokenize(self.no_punct)


            self.readability_flesch_re = \
                textstat.flesch_reading_ease(self.text_no_feed)
            self.readability_smog_index = \
                textstat.smog_index(self.text_no_feed)
            self.readability_flesch_kincaid_grade = \
                textstat.flesch_kincaid_grade(self.text_no_feed)
            self.readability_coleman_liau_index = \
                textstat.coleman_liau_index(self.text_no_feed)
            self.readability_ari = \
                textstat.automated_readability_index(self.text_no_feed)
            self.readability_linser_write = \
                textstat.linsear_write_formula(self.text_no_feed)
            self.readability_dale_chall = \
                textstat.dale_chall_readability_score(self.text_no_feed)
            self.readability_standard = \
                textstat.text_standard(self.text_no_feed)

            self.flesch_re_desc_str = self.flesch_re_desc(
                int(textstat.flesch_reading_ease(self.text_no_feed)))
            self.polysyllabcount = textstat.polysyllabcount(self.text_no_feed)
            self.lexicon_count = textstat.lexicon_count(self.text_no_feed)
            self.avg_syllables_per_word = textstat.avg_syllables_per_word(
                self.text_no_feed)
            self.avg_sentence_per_word = textstat.avg_sentence_per_word(
                self.text_no_feed)
            self.avg_sentence_length = textstat.avg_sentence_length(
                self.text_no_feed)
            self.avg_letter_per_word = textstat.avg_letter_per_word(
                self.text_no_feed)
            self.difficult_words = textstat.difficult_words(self.text_no_feed)
            self.rand_passive = self.select_random(self.passive_sentence_count,
                                                   self.passive_sentences)
            self.rand_weak_sentence = self.select_random(
                len(self.weak_sentences_all), self.weak_sentences_all)
            if self.word_tokens_no_punct:
                self.word_count = len(self.word_tokens_no_punct)
                self.page_length = float(self.word_count) / float(250)
                self.paper_count = int(math.ceil(self.page_length))
                self.parts_of_speech = pos_tag(self.word_tokens_no_punct)
                self.pos_counts = Counter(
                    tag for word, tag in self.parts_of_speech)
                self.pos_total = sum(self.pos_counts.values())
                self.pos_freq = dict(
                    (word, float(count) / self.pos_total)
                    for word, count in self.pos_counts.items())
                self.doc_pages = float(float(self.word_count) / float(250))
                self.freq_words = \
                    self.word_frequency(self.word_tokens_no_punct)
                self.modal_dist = self.modal_count(self.word_tokens_no_punct)
                # self.ws_tokens = self.ws_tokenize(self.text_no_cr)
                self.pos_count_dict = self.pos_counts.items()

            # Modals - the same pattern works for any POS tag
            self.modals = self.pos_isolate('MD', self.pos_count_dict)
            self.preposition_count = self.pos_isolate('IN',
                                                      self.pos_count_dict)
            self.adjective_count = self.pos_isolate_fuzzy(
                'JJ', self.pos_count_dict)
            self.adverb_count = self.pos_isolate_fuzzy('RB',
                                                       self.pos_count_dict)
            self.proper_nouns = self.pos_isolate_fuzzy('NNP',
                                                       self.pos_count_dict)
            self.cc_count = self.pos_isolate('CC', self.pos_count_dict)
            self.commas = self.char_count(",")
            self.comma_sentences = self.list_sentences(",")
            self.comma_example = self.select_random(len(self.comma_sentences),
                                                    self.comma_sentences)
            self.semicolons = self.char_count(";")
            self.semicolon_sentences = self.list_sentences(";")
            self.semicolon_example = self.select_random(
                len(self.semicolon_sentences), self.semicolon_sentences)
            self.lint_suggestions = lint(self.raw_text)
Example #22
def text_proc(corpus, urlDat={}, WORD_LIM=30, verbose=False):
    if isinstance(corpus, str) and "Redirecting" not in corpus:
        # and not str("privacy policy") in corpus:

        if str("some error has occurred while processing your request"
               ) in corpus:
            return {}
        if str("We apologize for the inconvenience...") in corpus:
            return {}
        # if np.mean([len(w) for w in corpus]) > 35:
        # 	return {}

        corpus = corpus.replace("/",
                                " ")  # remove characters that nltk can't read
        corpus = corpus.lower()
        corpus = corpus.replace(u"\xa0", u" ")
        corpus = corpus.replace(u"\\", u" ")
        corpus, this_is_science = extract_science_block(corpus)
        if "semantic" in urlDat.keys():
            if urlDat["semantic"]:
                this_is_science = True
        urlDat["science"] = this_is_science

        # print(corpus)
        # print(this_is_science, "this_is_science")
        urlDat["big_words"] = [word for word in corpus if len(word) > 16]
        ignoreSingleSentences = 1

        corpus = cleanup_pretagger_all(corpus)
        if verbose:
            st.text("pretagger all")
            st.text(type(corpus))

        tokens = word_tokenize(corpus)
        if verbose:
            st.text("token input")
            st.text(tokens)
        tokens = [t for t in tokens if t not in not_want_list]
        # if np.mean([len(t) for t in tokens]) > 50:
        # 	return {}
        # tokens = [t for t in tokens if len(t) < 50]
        # if verbose:
        # 	st.text("token input")
        # 	st.text(tokens)
        wc, sc, sylCount, remainingText, wordLen = countWordsSentSyl(
            tokens, ignoreSingleSentences=1)

        if len(tokens) < WORD_LIM:
            return {}
        if len(tokens) >= WORD_LIM:

            remainingText = " ".join(remainingText)
            remainingText = remainingText.lower()
            urlDat["standard"] = textstat.text_standard(remainingText,
                                                        float_output=True)
            # st.markdown(urlDat["standard"])
            if wc > 0 and sc > 0:
                if "semantic" in urlDat.keys() or urlDat["standard"] > 95:

                    # else:
                    #    urlDat["hard_snippet"] = None
                    urlDat["fre_unbiased"] = freeAlongtheText(corpus,
                                                              chunk_length=512)
                    fre = FRE(wc, sc, sylCount)

                    if "semantic" in urlDat.keys():
                        if urlDat["semantic"]:
                            ndc = NDC(
                                remainingText, wc, sc
                            )  # calc NDC Index and Percentage of Difficult Words
                # if not "fre_unbiased" in urlDat.keys() and urlDat["standard"]>100:
                meanv, total, hard_snippet = complexityAlongtheText(
                    corpus, chunk_length=256)
                urlDat["standard_unbiased"] = meanv
                # urlDat["standard"] = total
                # if this_is_science:
                if "semantic" in urlDat.keys():
                    urlDat["hard_snippet"] = hard_snippet

                    # urlDat["fre"] = fre  # textstat.text_standard(corpus, float_output=True)
                    # urlDat["standard"] = ndc[0]
                # https://stackoverflow.com/questions/62492797/get-bibliography-list-and-its-count-from-text-python
            # print(urlDat["standard"])
            """
            if "fre_unbiased" in urlDat.keys():
                if (
                    urlDat["fre_unbiased"] < urlDat["standard"]
                    and urlDat["fre_unbiased"] > 0
                ):
                    urlDat["standard"] = urlDat["fre_unbiased"]
                if urlDat["standard"] == 0 and urlDat["fre_unbiased"] > 0:
                    urlDat["standard"] = urlDat["fre_unbiased"]
            """
            # if (
            #    urlDat["standard_unbiased"] < urlDat["standard"]
            #    and urlDat["standard_unbiased"] > 0
            # ):
            #    urlDat["standard"] = urlDat["standard_unbiased"]
            # if fre<urlDat["standard"] and fre>0:
            #    urlDat["standard"] = fre
            # if urlDat["standard"] > 60 and ndc[0]>0 and ndc[0]<60:
            #    urlDat["standard"] = ndc[0]

            # urlDat["concensus"] = np.mean(
            # 	[
            # 		np.mean(fre),
            # 		np.mean(urlDat["standard_unbiased"]),
            # 	]
            # )
            tokens = [w.lower() for w in tokens if w.isalpha()]
            tokens = [w.lower() for w in tokens]  # make everything lower case
            urlDat["wcount"] = textstat.lexicon_count(str(tokens))
            word_lim = bool(urlDat["wcount"] > WORD_LIM)
            # print(urlDat["tokens"])

            if len(tokens):
                if "semantic" in urlDat.keys():
                    urlDat["tokens"] = tokens

                lexicon = textstat.lexicon_count(corpus, True)
                urlDat["uniqueness"] = len(set(tokens)) / float(len(tokens))
                urlDat["unique_words"] = len(set(tokens))

                # It's harder to have a good unique ratio in a long document, as 'and', 'the' and 'a' will dominate.
                # big deltas mean redundancy/sparse information density

                testimonial = TextBlob(corpus)
                urlDat["sp"] = testimonial.sentiment.polarity
                urlDat["ss"] = testimonial.sentiment.subjectivity
                urlDat["sp_norm"] = np.abs(testimonial.sentiment.polarity)
                urlDat["ss_norm"] = np.abs(testimonial.sentiment.subjectivity)
                urlDat["gf"] = textstat.gunning_fog(corpus)
    if "standard" in urlDat.keys():
        if urlDat["standard"] == 0:
            if verbose:
                st.text("gets here")
            # return {}

    return urlDat
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 31 15:25:10 2016

@author: megan
"""

from textstat.textstat import textstat as ts

fname = 'actbacFB.txt'

with open(fname, 'r', encoding='utf-8') as f:
    data = f.read().replace('\n', '')

total = ts.lexicon_count(data)
difficult = ts.difficult_words(data)
fkre = ts.flesch_reading_ease(data)
grade = ts.flesch_kincaid_grade(data)
overall = ts.text_standard(data)

print("Total words:", total)
print("Difficult words:", difficult)
print("FKRE:", fkre)
print("Grade:", grade)
print("Overall readability", overall)
Example #24
def text_proc(corpus, urlDat={}, WORD_LIM=100):

    #remove unreadable characters
    if type(corpus) is str and str('privacy policy') not in corpus:
        corpus = corpus.replace("-",
                                " ")  #remove characters that nltk can't read
        textNum = re.findall(
            r'\d', corpus)  #locate numbers that nltk cannot see to analyze
        tokens = word_tokenize(corpus)

        stop_words = stopwords.words('english')
        # A list comprehension that only keeps words that are NOT in stop_words and NOT punctuation.

        tokens = [word for word in tokens if not word in stop_words]
        tokens = [w.lower() for w in tokens]  #make everything lower case

        # the kind of change that might break everything
        urlDat['wcount'] = textstat.lexicon_count(str(tokens))
        word_lim = bool(urlDat['wcount'] > WORD_LIM)

        ## Remove the search term from the tokens somehow.
        urlDat['tokens'] = tokens

        if 'big_model' in urlDat.keys():
            urlDat['perplexity'] = perplexity(corpus, urlDat['big_model'])
        else:
            urlDat['perplexity'] = None
        # Word limits can be used to filter out product merchandise websites, which otherwise dominate scraped results.
        # The search engine business model is revenue oriented, so most links will be for merchandise.

        urlDat['publication'] = publication_check(str(tokens))[1]
        urlDat['clue_words'] = clue_words(str(tokens))[1]
        if str('link') in urlDat.keys():
            urlDat['clue_links'] = clue_links(urlDat['link'])[1]

            temp = len(urlDat['clue_words']) + len(
                urlDat['publication']) + len(urlDat['clue_links'])
            if temp > 10 and str('wiki') not in urlDat['link']:
                urlDat['science'] = True
            else:
                urlDat['science'] = False
            if str('wiki') in urlDat['link']:
                urlDat['wiki'] = True
            else:
                urlDat['wiki'] = False
        # The post modern essay generator is so obfuscated, that ENGLISH classification fails, and this criteria needs to be relaxed.
        not_empty = bool(len(tokens) != 0)

        if not_empty and word_lim:  #  and server_error:

            tokens = [w.lower() for w in tokens if w.isalpha()]
            #fdist = FreqDist(tokens) #frequency distribution of words only
            # The larger the ratio of unique words to repeated words, the more colourful the language.
            lexicon = textstat.lexicon_count(corpus, True)
            urlDat['uniqueness'] = len(set(tokens)) / float(len(tokens))
            # It's harder to have a good unique ratio in a long document, as 'and', 'the' and 'a' will dominate.
            # big deltas mean redundancy/sparse information density

            urlDat['info_density'] = comp_ratio(corpus)

            #Sentiment and Subjectivity analysis
            testimonial = TextBlob(corpus)
            urlDat['sp'] = testimonial.sentiment.polarity
            urlDat['ss'] = testimonial.sentiment.subjectivity
            urlDat['sp_norm'] = np.abs(testimonial.sentiment.polarity)
            urlDat['ss_norm'] = np.abs(testimonial.sentiment.subjectivity)
            urlDat['gf'] = textstat.gunning_fog(corpus)

            # explanation of metrics
            # https://github.com/shivam5992/textstat

            urlDat['standard'] = textstat.text_standard(corpus,
                                                        float_output=True)
            #urlDat['standard_'] = copy.copy(urlDat['standard'] )
            # special sauce
            # Good writing should be readable, objective, concise.
            # The writing should be articulate/expressive enough not to have to repeat phrases,
            # thereby seeming redundant. Articulate expressive writing then employs
            # many unique words, and does not yield high compression savings.
            # Good writing should not be obfuscated either. The reading level is a check for obfuscation.
            # The resulting metric is a balance of concision, low obfuscation, and expression.

            wc = float(1.0 / urlDat['wcount'])
            # compressed/uncompressed. Smaller is better.
            # as it means writing was low entropy, redundant, and easily compressible.
            urlDat['scaled'] = wc * urlDat['standard']
            urlDat['conciseness'] = urlDat['wcount']*(urlDat['uniqueness']) + \
            urlDat['wcount']*(urlDat['info_density'])

            urlDat['conciseness'] = bi_log_value(urlDat['conciseness'])
            if urlDat['perplexity'] is not None:
                urlDat['perplexity'] = bi_log_value(urlDat['perplexity'])

                penalty = (urlDat['standard'] + urlDat['conciseness']+\
                urlDat['scaled'] + urlDat['perplexity'])/4.0
            else:
                penalty = (urlDat['standard'] + urlDat['conciseness'] +
                           urlDat['scaled']) / 3.0

            #computes perplexity of the unigram model on a testset
            urlDat['penalty'] = penalty

        return urlDat
Example #25
def text_proc(corpus, urlDat = {}, WORD_LIM = 100):
    # remove unreadable characters
    corpus = corpus.replace("-", " ") #remove characters that nltk can't read
    textNum = re.findall(r'\d', corpus) #locate numbers that nltk cannot see to analyze
    tokens = word_tokenize(corpus)
    tokens = [w.lower() for w in tokens] #make everything lower case
    # the kind of change that might break everything

    urlDat['wcount'] = textstat.lexicon_count(str(tokens))
    word_lim = bool(urlDat['wcount']  > WORD_LIM)

    # Word limits can be used to filter out product merchandise websites, which otherwise dominate scraped results.
    # The search engine business model is revenue oriented, so most links will be for merchandise.

    try:
        urlDat['english'] = english_check(corpus)
        urlDat['clue_words'] = clue_words(corpus)
        urlDat['clue_links'] = clue_links(urlDat['link'])
    except:
        urlDat['english'] = True
        urlDat['clue_words'] = (False,[])
        urlDat['clue_links'] = (False,[])

    # The post modern essay generator is so obfuscated, that ENGLISH classification fails, and this criteria needs to be relaxed.
    not_empty = bool(len(tokens) != 0)
    if not_empty and urlDat['english'] and word_lim: #  and server_error:

        tokens = [ w.lower() for w in tokens if w.isalpha() ]
        #fdist = FreqDist(tokens) #frequency distribution of words only
        # The larger the ratio of unique words to repeated words, the more colourful the language.
        lexicon = textstat.lexicon_count(corpus, True)
        urlDat['uniqueness'] = len(set(tokens))/float(len(tokens))
        # It's harder to have a good unique ratio in a long document, as 'and', 'the' and 'a' will dominate.
        # big deltas mean redundancy/sparse information density

        # Rationale for this metric:
        # Different papers and different scientific concepts,
        # incur very different degrees of irreducible complexity
        # intrinsic to the complexity of the concepts they are tasked with communicating.

        # Assumption 1: the stanford analysis is too basic to accommodate differences in
        # intrinsic complexity of concepts
        # Assumption 2: Information theory may be sensitive to intrinsic irreducible complexity

        urlDat['info_density'] =  comp_ratio(corpus)

        # Fudge factor:
        # The log should be moved to plotting.
        #scaled_density = -1.0 * abs(urlDat['info_density'] * (1.0/urlDat['wcount']))
        #urlDat['scaled_info_density'] = scaled_density

        #Sentiment and Subjectivity analysis
        testimonial = TextBlob(corpus)
        urlDat['sp'] = testimonial.sentiment.polarity
        urlDat['ss'] = testimonial.sentiment.subjectivity
        urlDat['gf'] = textstat.gunning_fog(corpus)

        # explanation of metrics
        # https://github.com/shivam5992/textstat

        standard_  = textstat.text_standard(corpus)
        try:
            urlDat['standard']  = float(standard_[0:2])
        except:
            urlDat['standard']  = float(standard_[0:1])

        # special sauce
        # Good writing should be readable, objective, concise.
        # The writing should be articulate/expressive enough not to have to repeat phrases,
        # thereby seeming redundant. Articulate expressive writing then employs
        # many unique words, and does not yield high compression savings.
        # Good writing should not be obfuscated either. The reading level is a check for obfuscation.
        # The resulting metric is a balance of concision, low obfuscation, and expression.
        penalty = urlDat['standard']  + abs(urlDat['sp']) + abs(urlDat['ss']) # +float(scaled_density)
        urlDat['penalty'] = penalty
    return urlDat
print(textstat.flesch_reading_ease(test_data))
print("The SMOG Index")
print("Texts of fewer than 30 sentences are statistically invalid, "
      "because the SMOG formula was normed on 30-sentence samples.")
print("textstat requires atleast 3 sentences for a result.")
print(textstat.smog_index(test_data))
print("The Flesch-Kincaid Grade")
print(textstat.flesch_kincaid_grade(test_data))
print("The Coleman-Liau Index")
print(textstat.coleman_liau_index(test_data))
print("Automated Readability Index (ARI)")
print(textstat.automated_readability_index(test_data))
print("Dale-Chall Readability Score")
print(textstat.dale_chall_readability_score(test_data))
print("Linsear Write Formula")
print(textstat.linsear_write_formula(test_data))
print("The Fog Scale (Gunning FOG Formula)")
print(textstat.gunning_fog(test_data))
print(
    "---------------------------------Summary----------------------------------"
)
print("Readability Consensus based upon all the above tests")
print(
    "Based upon all the above tests, "
    "returns the estimated school grade level required to understand the text."
)
print(textstat.text_standard(test_data))
print(
    "--------------------------------------------------------------------------"
)
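Given the SMOG caveat printed above (the formula was normed on 30-sentence samples, and textstat itself needs at least 3 sentences), a guard along these lines keeps the index from being reported on samples that are too short; this is a sketch, not part of the original script:

def smog_if_valid(text, min_sentences=30):
    # The caller decides how strict to be; 30 matches the norming sample size.
    if textstat.sentence_count(text) < min_sentences:
        return None
    return textstat.smog_index(text)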
Example #27
    '../output_text/trump_out.txt', '../output_text/shakespeare_out.txt',
    '../output_text/drseuss_out.txt'
]

# input_file_names = ['../data_parsed/trump.txt',
input_file_names = [
    '../data_parsed/shakespeare.txt', '../data_parsed/drseuss.txt'
]

for i in range(0, len(input_file_names)):
    input_file_name = input_file_names[i]
    print(input_file_name)
    with open(input_file_name, 'r') as myfile:
        test_data = myfile.read().replace('\n', '')

    print "flesch_reading_ease: " + str(
        textstat.flesch_reading_ease(test_data))
    print "smog_index: " + str(textstat.smog_index(test_data))
    print "flesch_kincaid_grade: " + str(
        textstat.flesch_kincaid_grade(test_data))
    print "coleman_liau_index: " + str(textstat.coleman_liau_index(test_data))
    print "automated_readability_index: " + str(
        textstat.automated_readability_index(test_data))
    print "dale_chall_readability_score: " + str(
        textstat.dale_chall_readability_score(test_data))
    print "difficult_words: " + str(textstat.difficult_words(test_data))
    print "linsear_write_formula: " + str(
        textstat.linsear_write_formula(test_data))
    print "gunning_fog: " + str(textstat.gunning_fog(test_data))
    print "text_standard: " + str(textstat.text_standard(test_data))
Example #28
        v = cr[k]
        gl = []
        for s in tqdm(v):
            if (gl == []):
                gl.append(textstat.flesch_kincaid_grade(s) / len(v))
                gl.append(textstat.smog_index(s) / len(v))
                gl.append(textstat.automated_readability_index(s) / len(v))
                gl.append(textstat.dale_chall_readability_score(s) / len(v))
                gl.append(textstat.coleman_liau_index(s) / len(v))
                gl.append(textstat.linsear_write_formula(s) / len(v))
                gl.append(textstat.gunning_fog(s) / len(v))
            else:
                gl[0] += textstat.flesch_kincaid_grade(s) / len(v)
                gl[1] += textstat.smog_index(s) / len(v)
                gl[2] += textstat.automated_readability_index(s) / len(v)
                gl[3] += textstat.dale_chall_readability_score(s) / len(v)
                gl[4] += textstat.coleman_liau_index(s) / len(v)
                gl[5] += textstat.linsear_write_formula(s) / len(v)
                gl[6] += textstat.gunning_fog(s) / len(v)
        t = ""
        for s in v:
            t += s
        gl.append(textstat.text_standard(t))
        data.append([k] + gl)
    except:
        print "null"

with open('speaker_map_all.csv', 'w') as file:
    writer = csv.writer(file)
    writer.writerows(data)
def get_grade_level(block):
	consensus = textstat.text_standard(block)
	return float(consensus[0]) + .5
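get_grade_level parses only the leading character of the consensus string, which gives a wrong answer for double-digit grades such as "10th and 11th grade". Several of the other examples instead pass float_output=True, which sidesteps the string parsing entirely; a hypothetical alternative:

def get_grade_level_float(block):
    # textstat can return the consensus grade as a number directly.
    return textstat.text_standard(block, float_output=True)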
    testimonial = TextBlob(soup_nocode)

    polarity_val = testimonial.sentiment.polarity
    subjectivity_val = testimonial.sentiment.subjectivity
    #print("\nAverage sentiment::\n")
    #print(testimonial.sentiment)
    # for t in testimonial.sentences:
    # 	print("\nsentiment :::\n ")
    # 	print(t)
    # 	print(t.sentiment)

    #calculate readability
    #print("\nAverage readability::\n")
    if (len(soup_nohtml) > 1):
        readability_val = textstat.text_standard(soup_nohtml)
        readability_score = readability_val.split("th")[0]
    else:
        readability_score = 0

#get the number of the lower grade

#print(readability_val)
#print(readability_score)

# update values

# alterando os dados da tabela

    cursor.execute(
        """
Example #31
            FILES = [FILE_OR_DIR]

        for FILE in FILES:
            print('Processing', FILE)
            TEXT = read_file(FILE)

            print('Flesch reading ease', textstat.flesch_reading_ease(TEXT))
            print('Smog index', textstat.smog_index(TEXT))
            print('Flesch Kincaid grade', textstat.flesch_kincaid_grade(TEXT))
            print('Coleman Liau', textstat.coleman_liau_index(TEXT))
            print('Automated readability index', textstat.automated_readability_index(TEXT))
            print('Dale Chall readability score', textstat.dale_chall_readability_score(TEXT))
            print('Difficult words', textstat.difficult_words(TEXT))
            print('Linsear write formula', textstat.linsear_write_formula(TEXT))
            print('Gunning fog', textstat.gunning_fog(TEXT))
            print('Text standard', textstat.text_standard(TEXT))

            print('\nWords')
            WORDS = get_words(TEXT)
            get_word_stats(WORDS)

            print('\nWords no Stop Words')
            WORDS_NO_STOP = [w for w in WORDS if w not in stop]
            get_word_stats(WORDS_NO_STOP)

            print('\nSentences')
            SENTENCES = get_sentences(TEXT)
            get_sentence_stats(SENTENCES)
            print()

            WORD_SETS[FILE_OR_DIR] |= set(WORDS)
Example #32
File: test.py Project: zaork/textstat
    def test_text_standard(self):
        standard = textstat.text_standard(self.long_test)

        self.assertEqual("9th and 10th grade", standard)
Example #33
    def updateData(self):

        # Full list of polarity scores
        self.polscore = self.sid.polarity_scores(self.text)

        ##### INDEX 0 IN DATA: Text Sentiment #####
        # [INDEX 0] Compounded score (0.0 - 1.0)            [INDEX 1] Negative connotation rating (0.0 - 1.0),
        # [INDEX 2] Positive connotation rating (0.0 - 1.0) [INDEX 3] Neutral connotation rating (0.0 - 1.0)
        self.data.append([
            self.polscore['compound'], self.polscore['neg'],
            self.polscore['pos'], self.polscore['neu']
        ])

        ##### INDEX 1 IN DATA: Sentence Info #####
        # [INDEX 0] Sentence count          [INDEX 1] Average sentence length
        # [INDEX 2] Syllable count          [INDEX 3] Overall word count
        # [INDEX 4] Character count         [INDEX 5] Character count without spaces
        # [INDEX 6] Avg letters per word    [INDEX 7] Avg syllables per word
        self.data.append([
            textstat.sentence_count(self.text),
            textstat.avg_sentence_length(self.text),
            textstat.syllable_count(self.text),
            len(self.splList),
            textstat.char_count(self.text, False),
            textstat.char_count(self.text, True),
            textstat.avg_letter_per_word(self.text),
            textstat.avg_syllables_per_word(self.text)
        ])

        ##### INDEX 2 IN DATA: Flesch Reading Ease #####
        # [INDEX 0] Pure score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 100
        self.freRaw = textstat.flesch_reading_ease(self.text)
        self.freStat = min(max(self.freRaw, 0), 100)
        self.data.append([
            round(self.freStat, 3),
            self.freGrade(self.freStat),
            round(abs(self.freStat - 100), 2)
        ])

        ##### INDEX 3 IN DATA: Flesch-Kincaid Grade #####
        # [INDEX 0] Pure score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 18
        self.fkgRaw = textstat.flesch_kincaid_grade(self.text)
        self.fkgStat = self.adjustScore(self.fkgRaw)
        self.data.append([
            round(self.fkgStat, 3),
            self.grade(self.fkgStat),
            round(self.fkgStat / 0.18, 2)
        ])

        ##### INDEX 4 IN DATA: Gunning FOG Index #####
        # [INDEX 0] Pure Score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 18
        self.fogRaw = textstat.gunning_fog(self.text)
        self.fogStat = self.adjustScore(self.fogRaw)
        self.data.append([
            round(self.fogStat, 3),
            self.grade(self.fogStat),
            round(self.fogStat / 0.18, 2)
        ])

        ##### INDEX 5 IN DATA: SMOG Index #####
        # [INDEX 0] Pure Score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 18
        self.smogRaw = textstat.smog_index(self.text)
        self.smogStat = self.adjustScore(self.smogRaw)
        self.data.append([
            round(self.smogStat, 3),
            self.grade(self.smogStat),
            round(self.smogStat / 0.18, 2)
        ])

        ##### INDEX 6 IN DATA: Automated Readability Index #####
        # [INDEX 0] Pure Score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 14
        self.ariRaw = textstat.automated_readability_index(self.text)
        self.ariStat = min(max(self.ariRaw, 0), 14)
        self.data.append([
            round(self.ariStat, 3),
            self.ariGrade(ceil(self.ariStat)),
            round(self.ariStat / 0.14, 2)
        ])  #13

        ##### INDEX 7 IN DATA: Coleman-Liau Index #####
        # [INDEX 0] Pure Score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 18
        self.cliRaw = textstat.coleman_liau_index(self.text)
        self.cliStat = self.adjustScore(self.cliRaw)
        self.data.append([
            round(self.cliStat, 3),
            self.grade(self.cliStat),
            round(self.cliStat / 0.18, 2)
        ])

        ##### INDEX 8 IN DATA: Linsear Write Index #####
        # [INDEX 0] Pure Score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 18
        self.lwiRaw = textstat.linsear_write_formula(self.text)
        self.lwiStat = self.adjustScore(self.lwiRaw)
        self.data.append([
            round(self.lwiStat, 3),
            self.grade(self.lwiStat),
            round(self.lwiStat / 0.18, 2)
        ])

        ##### INDEX 9 IN DATA: Dale-Chall Readability Score #####
        # [INDEX 0] Pure Score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 10
        self.dcrRaw = textstat.dale_chall_readability_score(self.text)
        self.dcrStat = min(max(self.dcrRaw, 0), 10)
        self.data.append([
            round(self.dcrStat, 3),
            self.daleChallGrade(self.dcrStat),
            round(self.dcrStat / 0.1, 2)
        ])

        ##### INDEX 10 IN DATA: Overall Score #####
        # [INDEX 0] Pure Score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 20
        self.txtRaw = textstat.text_standard(self.text, True)
        self.txtStd = min(max(self.txtRaw, 0), 20)
        self.txtInfo = textstat.text_standard(self.text)
        self.data.append([
            round(self.txtStd, 3),
            self.txtGrade(self.txtStd, self.txtInfo),
            round(self.txtStd / 0.2, 2)
        ])

        return self.data
 def reading_level_comp(string):
     try:
         level = textstat.text_standard(string)
         return level
     except:
         return "Unclear"