def lex_readability(self, text, mode='fre'):
    """Compute one textstat readability metric for ``text``, or all of them.

    ``mode`` selects what is returned:
        'fre'      -> textstat.flesch_reading_ease(text)
        'fog'      -> textstat.gunning_fog(text)
        'fkg'      -> textstat.flesch_kincaid_grade(text)
        'dcr'      -> textstat.dale_chall_readability_score(text)
        'text_std' -> textstat.text_standard(text, float_output=True)
        'all'      -> a 5-tuple of the above, in that order

    Any other ``mode`` returns None.
    """
    # (mode key, textstat function name, extra keyword arguments)
    metric_table = (
        ('fre', 'flesch_reading_ease', {}),
        ('fog', 'gunning_fog', {}),
        ('fkg', 'flesch_kincaid_grade', {}),
        ('dcr', 'dale_chall_readability_score', {}),
        ('text_std', 'text_standard', {'float_output': True}),
    )

    if mode == 'all':
        return tuple(
            getattr(textstat, function_name)(text, **extra)
            for _, function_name, extra in metric_table
        )

    for key, function_name, extra in metric_table:
        if mode == key:
            return getattr(textstat, function_name)(text, **extra)

    return None
示例#2
0
def seven_test(processed_essay):
    """
    Compute eight textstat readability measures for every essay.

    Each returned list starts with a short column label ("FS", "GI", ...)
    followed by one value per item of ``processed_essay``; every item is
    converted with ``str()`` before it is scored.

    :param processed_essay: iterable of essays
    :return: flesch_score, gunning_index, kincaid_grade, liau_index,
        automated_readability_index, dale_readability_score, difficult_word,
        linsear_write (a tuple of eight lists, in that order)
    """
    # (column label, textstat function name) in the order the columns are returned.
    column_specs = (
        ("FS", "flesch_reading_ease"),
        ("GI", "gunning_fog"),
        ("KG", "flesch_kincaid_grade"),
        ("LI", "coleman_liau_index"),
        ("ARI", "automated_readability_index"),
        ("DLS", "dale_chall_readability_score"),
        ("DW", "difficult_words"),
        ("LW", "linsear_write_formula"),
    )
    columns = [[label] for label, _ in column_specs]
    for essay in processed_essay:
        for column, (_, function_name) in zip(columns, column_specs):
            column.append(getattr(textstat, function_name)(str(essay)))
    return tuple(columns)
def readability(text):
    """
    Score ``text`` with the Flesch reading-ease formula and describe the result.
    A higher score means the text is easier to read.

    text: input text on which the score is calculated

    Returns a ``(score, summary, grade)`` tuple: the Flesch reading-ease score,
    a summary sentence chosen from the score band, and the Flesch-Kincaid grade
    rounded to an integer.
    """
    ease = textstat.flesch_reading_ease(text)
    grade_level = round(textstat.flesch_kincaid_grade(text))

    # Exclusive lower bounds, checked from the easiest band downwards.
    bands = (
        (90, "Very easy to read. Easily understood by an average 11-year-old student; "),
        (80, "Easy to read. Conversational English for consumers"),
        (70, "Fairly easy to read"),
        (60, "Plain English. Easily understood by 13- to 15-year-old students."),
        (50, "Fairly difficult to read."),
        (30, "Difficult to read"),
    )
    # Used when the score does not exceed any bound above.
    description = "Very difficult to read. Best understood by university graduates."
    for lower_bound, band_text in bands:
        if ease > lower_bound:
            description = band_text
            break

    return ease, description, grade_level
示例#4
0
def analyze():
    """Score the request body with textstat and return the report as JSON.

    Reads the raw body from the ``request`` object defined outside this
    function (presumably a Flask request - confirm), decodes it as UTF-8,
    strips surrounding whitespace, and returns
    ``decorate_response(jsonify(report))``.
    """
    print(request)
    body_text = request.data.decode("utf-8").strip()

    # (report key, textstat function name) in the order the keys are emitted.
    report_fields = (
        ("flesch-reading-ease", "flesch_reading_ease"),
        ("smog-index", "smog_index"),
        ("flesch-kincaid-grade", "flesch_kincaid_grade"),
        ("coleman-liau-index", "coleman_liau_index"),
        ("automated-readability-index", "automated_readability_index"),
        ("dale-chall-readability-score", "dale_chall_readability_score"),
        ("difficult-words", "difficult_words"),
        ("linsear-write-formula", "linsear_write_formula"),
        ("gunning-fog", "gunning_fog"),
        ("text-standard", "text_standard"),
    )
    report = {
        key: getattr(textstat, function_name)(body_text)
        for key, function_name in report_fields
    }
    return decorate_response(jsonify(report))
示例#5
0
文件: DE_main.py 项目: zzs-NLP/ACS-QG
def get_readibility(text, metric="flesch_kincaid_grade"):
    """
    Score ``text`` with the textstat readability function named by ``metric``.

    ``metric`` must be one of the names listed in ``supported_metrics`` below;
    the textstat function of the same name is called with ``text`` and its
    result is returned. For any other value an error line is printed and None
    is returned.

    Reference: https://chartbeat-labs.github.io/textacy/getting_started/quickstart.html
               https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
    """
    supported_metrics = (
        "flesch_kincaid_grade",
        "flesch_reading_ease",
        "smog_index",
        "coleman_liau_index",
        "automated_readability_index",
        "dale_chall_readability_score",
        "difficult_words",
        "linsear_write_formula",
        "gunning_fog",
        "text_standard",
    )
    for name in supported_metrics:
        if metric == name:
            return getattr(textstat, name)(text)

    print("ERROR: Please select correct metric!")
    return None
示例#6
0
def calculate_stats(data_folder):
    """Print vocabulary and readability statistics for each dataset's test.json.

    For every dataset key in ``dataset_fields`` (defined outside this function)
    the file ``<data_folder>/<dataset>/test.json`` is loaded, the configured
    text field of every item is joined into one string, and word counts plus
    three textstat scores are printed.

    :param data_folder: directory containing one sub-folder per dataset
    """
    data_folder = Path(data_folder)
    for dataset in dataset_fields:
        print(f"loading {dataset}")
        field = dataset_fields[dataset]["text"].strip()

        # Use a context manager so the file handle is closed right after parsing.
        with open(data_folder / dataset / "test.json") as handle:
            items = json.load(handle)

        # A list-valued field contributes only its last element.
        sentences = [
            item[field][-1] if isinstance(item[field], list) else item[field]
            for item in items
        ]

        text = " ".join(sentences)
        lex_count = textstat.lexicon_count(text)
        print(lex_count)
        unique_words = count_words(text)
        print(f"all unique {len(unique_words)}")

        lower_unique_words = count_words(text, casing="lower")
        print(f"lowercase unique {len(lower_unique_words)}")

        upper_unique_words = count_words(text, casing="upper")
        print(f"uppercase unique {len(upper_unique_words)}")

        # Guard the ratio: an empty dataset has no unique words to divide by.
        unique_total = len(unique_words)
        ratio = len(upper_unique_words) / unique_total if unique_total else float("nan")
        print(f"ratio {ratio}")

        text_standard = textstat.text_standard(text, float_output=True)
        print(f"text_standard: {text_standard}")

        dale_chall_readability_score = textstat.dale_chall_readability_score(text)
        print(f"dale_chall_readability_score: {dale_chall_readability_score}")

        flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
        print(f"flesch_kincaid_grade: {flesch_kincaid_grade}")
示例#7
0
def readability(queries):
    """Compute ten textstat readability measures for every query.

    :param queries: iterable of text strings
    :return: dict mapping each measure name ('Flesch', 'Smog', ...) to a list
        with one value per query, in input order. The 'Text Standard' values
        are requested with ``float_output=True``.
    """
    # The original first bound ``scores`` to an empty DataFrame and then
    # immediately overwrote it with this dict; only the dict was ever used.
    scores = {
        'Flesch': [],
        'Smog': [],
        'Flesch grade': [],
        'Coleman': [],
        'Automated': [],
        'Dale': [],
        'Difficult': [],
        'Linsear': [],
        'Gunning': [],
        'Text Standard': []
    }
    for line in queries:
        scores['Flesch'].append(textstat.flesch_reading_ease(line))
        scores['Smog'].append(textstat.smog_index(line))
        scores['Flesch grade'].append(textstat.flesch_kincaid_grade(line))
        scores['Coleman'].append(textstat.coleman_liau_index(line))
        scores['Automated'].append(textstat.automated_readability_index(line))
        scores['Dale'].append(textstat.dale_chall_readability_score(line))
        scores['Difficult'].append(textstat.difficult_words(line))
        scores['Linsear'].append(textstat.linsear_write_formula(line))
        scores['Gunning'].append(textstat.gunning_fog(line))
        scores['Text Standard'].append(
            textstat.text_standard(line, float_output=True))

    return scores
def fleschkincaid() -> List:
    """Return the Flesch-Kincaid grade of every text in ``policies['Policy']``.

    ``policies`` is defined outside this function; the result has one entry per
    text, in iteration order.
    """
    return [
        textstat.flesch_kincaid_grade(policy_text)
        for policy_text in policies['Policy']
    ]
示例#9
0
def main():
    """Score every PDF under ``pdfs/`` and write the results to a CSV file.

    For each ``pdfs/*.pdf`` file the text is extracted with
    ``parser.from_file`` and scored with the Flesch reading-ease and
    Flesch-Kincaid grade formulas. Files that cannot be parsed or scored get
    'N/A' in both score columns. The table is written to
    ``data/results/readability_results.csv``.
    """
    import os  # local import: only needed here for portable path handling

    columns = ['Utility', 'FK Score', 'FK Grade Level']
    rows = []

    for pdf_path in glob.glob('pdfs/*.pdf'):
        # The original split the path on '\\', which raised IndexError on
        # POSIX paths (even inside the except handler). basename() handles the
        # platform's separators; the name is still cut at the first '.'.
        utility = os.path.basename(str(pdf_path)).split('.')[0]
        try:
            content = parser.from_file(pdf_path)['content']
            row = {
                'Utility': utility,
                'FK Score': textstat.flesch_reading_ease(content),
                'FK Grade Level': textstat.flesch_kincaid_grade(content),
            }
        except Exception:
            # Best effort: keep the file in the report even if it is unreadable.
            row = {
                'Utility': utility,
                'FK Score': 'N/A',
                'FK Grade Level': 'N/A',
            }
        rows.append(row)

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv('data/results/readability_results.csv', encoding='utf-8')
示例#10
0
def textstat_stats(text):
    """Return a pandas Series of readability features for ``text``.

    The Series is indexed by feature name. 'syll_count' and 'lex_count' hold
    the syllable count and lexicon count divided by the number of
    whitespace-separated tokens in ``text``.

    SMOG and text_standard are intentionally not computed (the original author
    noted that SMOG is only useful on 30+ sentences).
    """
    doc_length = len(text.split())

    features = {
        # Flesch Reading Ease score
        'flesch_ease': ts.flesch_reading_ease(text),
        # Flesch-Kincaid grade level
        'flesch_grade': ts.flesch_kincaid_grade(text),
        # Gunning FOG index, also a grade level
        'gfog': ts.gunning_fog(text),
        # Automated Readability Index
        'auto_readability': ts.automated_readability_index(text),
        # Coleman-Liau grade level
        'cl_index': ts.coleman_liau_index(text),
        # Linsear Write grade level
        'lw_formula': ts.linsear_write_formula(text),
        # Dale-Chall readability score
        'dcr_score': ts.dale_chall_readability_score(text),
        # Both counts below are scaled by the token count.
        'syll_count': ts.syllable_count(text, lang='en_US') / doc_length,
        'lex_count': ts.lexicon_count(text, removepunct=True) / doc_length,
    }
    return pd.Series(list(features.values()), index=list(features.keys()))
示例#11
0
def get_stats(text):
    """Return a dict of textstat scores plus two simple counts for ``text``.

    Nine keys carry the result of the textstat function with the same name.
    'consolidated_score' is ``textstat.text_standard(text)``, 'doc_length' is
    ``len(text)`` and 'quote_count' is the number of '"' characters.
    """
    same_name_metrics = (
        "flesch_reading_ease",
        "smog_index",
        "flesch_kincaid_grade",
        "coleman_liau_index",
        "automated_readability_index",
        "dale_chall_readability_score",
        "difficult_words",
        "linsear_write_formula",
        "gunning_fog",
    )
    stats = {name: getattr(textstat, name)(text) for name in same_name_metrics}
    stats["consolidated_score"] = textstat.text_standard(text)
    stats["doc_length"] = len(text)  # think about excluding spaces?
    stats["quote_count"] = text.count('"')
    return stats
示例#12
0
def getReadabilityMetrics(test_data):
    '''
        Return the readability metrics of an article given as plain text.

        The result maps each metric name to the value of the textstat
        function with the same name. Requires the textstat library.
    '''
    metric_names = (
        "flesch_reading_ease",
        "smog_index",
        "flesch_kincaid_grade",
        "coleman_liau_index",
        "automated_readability_index",
        "dale_chall_readability_score",
        "difficult_words",
        "linsear_write_formula",
        "gunning_fog",
        "text_standard",
    )
    return {name: getattr(textstat, name)(test_data) for name in metric_names}
示例#13
0
def get_reading_level(html):
    '''
    Return the Flesch-Kincaid grade of the text extracted from ``html``.

    The text is obtained with ``get_text(html)`` (defined outside this
    function). The grade maps to a school year: a score of 9.3 means a ninth
    grader should be able to read the document.
    https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch%E2%80%93Kincaid_grade_level
    '''
    plain_text = get_text(html)
    return textstat.flesch_kincaid_grade(plain_text)
示例#14
0
def score(text):
    """Return six textstat readability scores for ``text`` as a tuple.

    Order: Flesch reading ease, Flesch-Kincaid grade, Gunning fog, SMOG index,
    Coleman-Liau index, Automated Readability Index.
    """
    scorers = (
        textstat.flesch_reading_ease,
        textstat.flesch_kincaid_grade,
        textstat.gunning_fog,
        textstat.smog_index,
        textstat.coleman_liau_index,
        textstat.automated_readability_index,
    )
    return tuple(scorer(text) for scorer in scorers)
示例#15
0
def readability_scores_mp(data):
    """Score one text and store the result in a caller-supplied mapping.

    ``data`` is a ``(result_dict, idx, text)`` triple. The function writes
    ``[flesch_kincaid_grade, dale_chall_readability_score]`` for ``text`` into
    ``result_dict[idx]`` and returns None. The ``_mp`` suffix suggests use as a
    multiprocessing worker - confirm against the caller.
    """
    shared_results, key, document = data

    # Flesch reading ease is deliberately not computed here.
    shared_results[key] = [
        textstat.flesch_kincaid_grade(document),
        textstat.dale_chall_readability_score(document),
    ]
def do_datas():
    """Append text statistics for the current document to the result lists.

    Reads ``words``, ``vocab`` and ``contents`` and appends one value to each
    accumulator list (``nw``, ``nvocab``, ``nsyllable``, ...). All of these
    names, the dictionaries (``lm_*``, ``gi_*``, ``hr_*``, ``gt_negation``) and
    the helpers ``count_occurrence`` / ``count_negation`` are defined outside
    this function. Returns None.
    """
    # logging.info('do_datas')

    ########### Save text statistics
    ##### 1. nw 2. nvocab 3. nsyllable 4. nsentence 5. tone 6. readability
    ## 1. nw
    nw.append(len(words))
    ## 2. nvocab
    nvocab.append(len(vocab))
    ## 3. syllable
    n = textstat.syllable_count(contents)
    nsyllable.append(n)
    ## 4. sentence
    n = textstat.sentence_count(contents)
    nsentence.append(n)
    ## 5. tone
    ### LM dictionary
    n_neg_lm.append(count_occurrence(words, lm_neg))
    n_pos_lm.append(count_occurrence(words, lm_pos))
    n_uctt_lm.append(count_occurrence(words, lm_uctt))
    n_lit_lm.append(count_occurrence(words, lm_lit))
    n_cstr_lm.append(count_occurrence(words, lm_cstr))
    n_modal1_lm.append(count_occurrence(words, lm_modal1))
    n_modal2_lm.append(count_occurrence(words, lm_modal2))
    n_modal3_lm.append(count_occurrence(words, lm_modal3))
    n_negation_lm.append(count_negation(words, lm_pos, gt_negation))
    ### General Inquirer dictionary
    n_neg_gi.append(count_occurrence(words, gi_neg))
    n_pos_gi.append(count_occurrence(words, gi_pos))
    n_negation_gi.append(count_negation(words, gi_pos, gt_negation))
    ### Henry dictionary
    n_neg_hr.append(count_occurrence(words, hr_neg))
    n_pos_hr.append(count_occurrence(words, hr_pos))
    # Negation must be counted against the Henry positive list; the original
    # passed gi_pos here, a copy-paste from the General Inquirer section.
    n_negation_hr.append(count_negation(words, hr_pos, gt_negation))
    ## 6. readability
    # Flesch reading ease: capped at 100, negative scores recorded as NaN.
    fre_i = textstat.flesch_reading_ease(contents)
    if fre_i > 100:
        fre_i = 100
    if fre_i < 0:
        fre_i = float('NaN')
    fre.append(fre_i)
    # Flesch-Kincaid grade: negative grades recorded as NaN.
    fkg_i = textstat.flesch_kincaid_grade(contents)
    if fkg_i < 0:
        fkg_i = float('NaN')
    fkg.append(fkg_i)
    # Coleman-Liau index: negative values recorded as NaN.
    # (The original carried a "# RIX" marker here; no RIX score is computed.)
    cl_i = textstat.coleman_liau_index(contents)
    if cl_i < 0:
        cl_i = float('NaN')
    cl.append(cl_i)
    # The remaining scores are stored unmodified.
    f = textstat.gunning_fog(contents)
    fog.append(f)
    f = textstat.automated_readability_index(contents)
    ari.append(f)
    f = textstat.smog_index(contents)
    smog.append(f)
示例#17
0
def metrics(sentence):
    """Return three readability scores for ``sentence`` and their grade labels.

    The scores (Flesch-Kincaid grade, Gunning fog, Dale-Chall) are rounded to
    three decimals. Each label comes from ``grade_label`` applied to the score
    rounded to an integer; the Dale-Chall score is first passed through
    ``dale_chall_norm``. Both helpers are defined outside this function.

    Returns ``(fk, gf, dc, fk_label, gf_label, dc_label)``.
    """
    scorers = (flesch_kincaid_grade, gunning_fog, dale_chall_readability_score)
    fk_score, gf_score, dc_score = (
        round(scorer(sentence), 3) for scorer in scorers
    )

    labels = (
        grade_label(round(fk_score)),
        grade_label(round(gf_score)),
        grade_label(dale_chall_norm(round(dc_score))),
    )

    return (fk_score, gf_score, dc_score) + labels
示例#18
0
    def doc_calc(self, article):
        """Compute readability, profanity and sentiment features for an article.

        Returns a pandas Series holding, in order: Flesch reading ease,
        Flesch-Kincaid grade, Gunning fog, the first element of
        ``predict_prob([article])`` and the TextBlob sentiment polarity.
        """
        features = [
            textstat.flesch_reading_ease(article),
            textstat.flesch_kincaid_grade(article),
            textstat.gunning_fog(article),
            predict_prob([article])[0],
            TextBlob(article).sentiment.polarity,
        ]
        return pd.Series(features)
示例#19
0
def compute_readability_stats(text):
    """
    Compute reading statistics of the given text
    Reference: https://github.com/shivam5992/textstat

    Parameters
    ==========
    text: str, input section or abstract text

    Returns
    =======
    dict with one entry per statistic. If any textstat call raises, every
    value in the dict is None (best-effort behaviour kept from the original).
    """
    # Output key -> name of the textstat function that produces it. Keeping one
    # table guarantees the success and fallback dicts have identical keys.
    metric_functions = {
        'flesch_reading_ease': 'flesch_reading_ease',
        'smog': 'smog_index',
        'flesch_kincaid_grade': 'flesch_kincaid_grade',
        'coleman_liau_index': 'coleman_liau_index',
        'automated_readability_index': 'automated_readability_index',
        'dale_chall': 'dale_chall_readability_score',
        'difficult_words': 'difficult_words',
        'linsear_write': 'linsear_write_formula',
        'gunning_fog': 'gunning_fog',
        'text_standard': 'text_standard',
        'n_syllable': 'syllable_count',
        'avg_letter_per_word': 'avg_letter_per_word',
        'avg_sentence_length': 'avg_sentence_length',
    }
    try:
        readability_dict = {
            key: getattr(textstat, function_name)(text)
            for key, function_name in metric_functions.items()
        }
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        readability_dict = dict.fromkeys(metric_functions)
    return readability_dict
示例#20
0
    def score(self, strText):
        """Compute textstat scores for ``strText`` and store them on ``self``.

        Every score is stored under the name of its textstat function. Most
        scores also get a ``str_<name>`` companion: either the result of
        ``self.grade(score)`` or, for Dale-Chall and Flesch reading ease,
        ' | ' followed by a band label from the tables below. Returns None.
        """
        def band_label(value, bands, fallback):
            # Label of the first band whose inclusive lower bound is reached.
            for lower_bound, label in bands:
                if value >= lower_bound:
                    return ' | ' + label
            return ' | ' + fallback

        dale_chall_bands = (
            (9.0, '13th to 15th grade (college)'),
            (8.0, '11th to 12th grade'),
            (7.0, '9th to 10th grade'),
            (6.0, '7th to 8th grade'),
            (5.0, '5th to 6th grade'),
        )
        reading_ease_bands = (
            (90, 'Very Easy'),
            (80, 'Easy'),
            (70, 'Fairly Easy'),
            (60, 'Standard'),
            (50, 'Fairly Difficult'),
            (30, 'Difficult'),
        )

        self.automated_readability_index = (
            textstat.automated_readability_index(strText))
        self.str_automated_readability_index = self.grade(
            self.automated_readability_index)

        self.coleman_liau_index = textstat.coleman_liau_index(strText)
        self.str_coleman_liau_index = self.grade(self.coleman_liau_index)

        self.dale_chall_readability_score = (
            textstat.dale_chall_readability_score(strText))
        self.str_dale_chall_readability_score = band_label(
            self.dale_chall_readability_score,
            dale_chall_bands,
            '4th grade or lower')

        self.difficult_words = textstat.difficult_words(strText)

        self.flesch_kincaid_grade = textstat.flesch_kincaid_grade(strText)
        self.str_flesch_kincaid_grade = self.grade(self.flesch_kincaid_grade)

        self.flesch_reading_ease = textstat.flesch_reading_ease(strText)
        self.str_flesch_reading_ease = band_label(
            self.flesch_reading_ease,
            reading_ease_bands,
            'Very Confusing')

        self.gunning_fog = textstat.gunning_fog(strText)
        self.str_gunning_fog = self.grade(self.gunning_fog)

        self.linsear_write_formula = textstat.linsear_write_formula(strText)
        self.str_linsear_write_formula = self.grade(self.linsear_write_formula)

        self.smog_index = textstat.smog_index(strText)
        self.str_smog_index = self.grade(self.smog_index)

        self.text_standard = textstat.text_standard(strText)
示例#21
0
	def getWordComplexityScore(self, tokens, i):
		"""Return a grade-level readability score for ``tokens``.

		``i`` selects the formula: 1 -> Gunning fog, 2 -> SMOG index, any other
		value -> Flesch-Kincaid grade. A higher score means the document takes
		a higher education level to read.
		"""
		if i == 1:
			return textstat.gunning_fog(tokens)
		if i == 2:
			# Texts of fewer than 30 sentences are statistically invalid, because
			# the SMOG formula was normed on 30-sentence samples.
			# textstat requires at least 3 sentences per article for a result.
			return textstat.smog_index(tokens)
		return textstat.flesch_kincaid_grade(tokens)
示例#22
0
def flesch_kincaid_grade(text):
    """
    Return textstat's Flesch-Kincaid grade level for ``text``.

    The Flesch-Kincaid reading level formula is:
        Flesch-Kincaid Grade Level = 0.39*ASL + 11.8*ASW - 15.59
    where
        ASL = average sentence length (number of words divided by number of sentences)
        ASW = average word length in syllables (number of syllables divided by number of words)

    :param text: The text
    :return: the grade level; lower is easier. The original author described
        the scale as 0-18 and targeted a score of 8 so that roughly 80% of
        Americans can read the text (not verified here).
    """
    grade_level = textstat.flesch_kincaid_grade(text)
    return grade_level
 def create_readability_features(self):
     """
     Add readability columns, computed with textstat, to both data frames.

     For ``self.X_train`` and ``self.X_test`` the "review_text" column is
     scored and two columns are written in place:
     review_text_readability_flesch_kincaid: Flesch-Kincaid grade
     review_text_ari: Automated Readability Index
     Both numbers represent the grade level needed to understand the text.
     """
     # (new column name, scoring function) in the order the columns are added.
     column_scorers = (
         ("review_text_readability_flesch_kincaid",
          textstat.flesch_kincaid_grade),
         ("review_text_ari", textstat.automated_readability_index),
     )
     for frame in (self.X_train, self.X_test):
         for column_name, scorer in column_scorers:
             frame[column_name] = frame["review_text"].apply(scorer)
def calc_readby(sents_series0):
    """Score every sentence and return the scores as a DataFrame.

    :param sents_series0: sequence (list, pandas Series, ...) of text strings
    :return: DataFrame with columns 'flesch_readby', 'flesch_kincaid' and
        'fogIndex', one row per sentence in input order

    The running position is printed every 10000 sentences as a progress hint.
    """
    flesch_readby = []
    flesch_kincaid = []
    fogIndex = []
    # Iterate the values directly. ``sents_series0[i0]`` is a *label* lookup on
    # a pandas Series, so it failed or picked the wrong rows whenever the index
    # was not 0..n-1 (for example after filtering).
    for i0, sent0 in enumerate(sents_series0):
        flesch_readby.append(textstat.flesch_reading_ease(sent0))
        flesch_kincaid.append(textstat.flesch_kincaid_grade(sent0))
        fogIndex.append(textstat.gunning_fog(sent0))
        if i0 % 10000 == 0:
            print(i0)

    return pd.DataFrame({
        'flesch_readby': flesch_readby,
        'flesch_kincaid': flesch_kincaid,
        'fogIndex': fogIndex,
    })
示例#25
0
 def generate_score(self, text):
     """Compute readability scores for ``text`` and store them on ``self``.

     Flesch reading ease, Flesch-Kincaid grade and Dale-Chall each also get a
     ``*_consensus`` attribute from ``readability_test_consensus`` and the
     matching grading table (all defined outside this method). SMOG, ARI and
     Coleman-Liau are stored as raw scores only. Returns None.
     """
     reading_ease = ts.flesch_reading_ease(text)
     self.flesch_reading_grade = reading_ease
     self.flesch_reading_grade_consensus = readability_test_consensus(
         reading_ease, flesch_ease_grading_system)

     kincaid_grade = ts.flesch_kincaid_grade(text)
     self.flesch_kincaid_grade = kincaid_grade
     self.flesch_kincaid_grade_consensus = readability_test_consensus(
         kincaid_grade, us_grade_level_system_age)

     dale_chall = ts.dale_chall_readability_score(text)
     self.dale_chall_grade = dale_chall
     self.dale_chall_grade_consensus = readability_test_consensus(
         dale_chall, dale_chall_system)

     self.smog_grade = ts.smog_index(text)
     self.ari_grade = ts.automated_readability_index(text)
     # The ARI consensus (us_grade_level_system_level) is disabled upstream.
     self.coleman_liau_grade = ts.coleman_liau_index(text)
示例#26
0
    def process(self, df):
        """Add readability features to ``df`` and pickle them to disk.

        Twelve feature columns are computed from ``df['articleBody']`` and
        written into ``df`` in place. The feature-name list and the feature
        matrix (``df[feats].values``) are then dumped, in that order, to
        ``../saved_data/read.pkl``. Progress and timing are printed.

        :param df: DataFrame with an 'articleBody' text column
        :return: always 1
        """
        t0 = time()
        print("\n---Generating Readability Features:---\n")

        def lexical_diversity(text):
            """Return distinct tokens / total tokens (0.0 when there are none)."""
            words = nltk.tokenize.word_tokenize(text.lower())
            if not words:
                # An empty article body used to raise ZeroDivisionError here.
                return 0.0
            return len(set(words)) / len(words)

        def get_counts(text, word_list):
            """Count the lower-cased tokens of ``text`` found in ``word_list``."""
            words = nltk.tokenize.word_tokenize(text.lower())
            return sum(1 for word in words if word in word_list)

        # Each of these columns is named after the textstat function behind it.
        textstat_feats = [
            'flesch_reading_ease', 'smog_index', 'flesch_kincaid_grade',
            'coleman_liau_index', 'automated_readability_index',
            'dale_chall_readability_score', 'difficult_words',
            'linsear_write_formula', 'gunning_fog',
        ]
        for name in textstat_feats:
            df[name] = df['articleBody'].map(getattr(textstat, name))

        df['i_me_myself'] = df['articleBody'].apply(
            get_counts, args=(['i', 'me', 'myself'],))
        df['punct'] = df['articleBody'].apply(
            get_counts, args=([',', '.', '!', '?'],))
        df['lexical_diversity'] = df['articleBody'].apply(lexical_diversity)

        feats = textstat_feats + ['i_me_myself', 'punct', 'lexical_diversity']
        feature_matrix = df[feats].values

        # Protocol -1 selects the highest pickle protocol available.
        with open('../saved_data/read.pkl', 'wb') as outfile:
            pickle.dump(feats, outfile, -1)
            pickle.dump(feature_matrix, outfile, -1)

        print ('readable features saved in read.pkl')

        print('\n---Readability Features is complete---')
        print("Time taken {} seconds\n".format(time() - t0))

        return 1
def cal_readability(target, source):
    """Compare the readability of generated texts with their originals.

    Three textstat scores (Flesch reading ease, Flesch-Kincaid grade,
    Dale-Chall) are computed for every text in ``target`` ("gen") and
    ``source`` ("orig"), plus the absolute per-pair difference ("diff").

    :param target: sequence of generated texts
    :param source: sequence of original texts; must be at least as long as
        ``target`` (the i-th target is compared with the i-th source)
    :return: ``(means, std_devs)`` - two dicts with nine entries each, keyed
        like "Flesch ease mean gen" / "Flesch ease std dev gen"
    """
    # (key prefix, textstat function name) in output order.
    metric_table = (
        ("Flesch ease", "flesch_reading_ease"),
        ("Flesch grade", "flesch_kincaid_grade"),
        ("Dale Chall Readability V2", "dale_chall_readability_score"),
    )

    def score_all(texts, function_name):
        # One score per text, using the named textstat function.
        return [getattr(textstat, function_name)(text) for text in texts]

    means = {}
    std_devs = {}
    for label, function_name in metric_table:
        generated = score_all(target, function_name)
        original = score_all(source, function_name)
        # Pairs are matched by position, driven by the target list.
        difference = [
            np.abs(score - original[position])
            for position, score in enumerate(generated)
        ]
        for suffix, values in (
            ("gen", generated),
            ("orig", original),
            ("diff", difference),
        ):
            means[f"{label} mean {suffix}"] = np.mean(values)
            std_devs[f"{label} std dev {suffix}"] = np.std(values)

    return means, std_devs
示例#28
0
 def readability_scores(self, text):
     """Compute eleven textstat scores for ``text`` and store them on ``self``.

     Each attribute is named after its textstat function, except ``ari``,
     which holds the Automated Readability Index. Returns None.
     """
     # (attribute on self, textstat function name), evaluated in this order.
     attribute_sources = (
         ("ari", "automated_readability_index"),
         ("flesch_kincaid_grade", "flesch_kincaid_grade"),
         ("coleman_liau_index", "coleman_liau_index"),
         ("dale_chall_readability_score", "dale_chall_readability_score"),
         ("flesch_reading_ease", "flesch_reading_ease"),
         ("gunning_fog", "gunning_fog"),
         ("linsear_write_formula", "linsear_write_formula"),
         ("lix", "lix"),
         ("rix", "rix"),
         ("smog_index", "smog_index"),
         ("text_standard", "text_standard"),
     )
     for attribute, function_name in attribute_sources:
         setattr(self, attribute, getattr(textstat, function_name)(text))
示例#29
0
def get_readability_stats(text):
    """Return a dict of textstat readability scores for ``text``.

    Every key is the name of the textstat function that produced its value.
    'text_standard' is requested with ``float_output=True``.
    """
    plain_metrics = (
        'flesch_reading_ease',
        'smog_index',
        'flesch_kincaid_grade',
        'coleman_liau_index',
        'automated_readability_index',
        'dale_chall_readability_score',
        'linsear_write_formula',
        'gunning_fog',
    )
    stats = {name: getattr(textstat, name)(text) for name in plain_metrics}
    stats['text_standard'] = textstat.text_standard(text, float_output=True)
    return stats
def analyze(fileName):
    """Score one text file and append the result to the ``outF`` output file.

    The file is read as UTF-8, newlines and '*' characters are dropped,
    square-bracketed spans are removed, and the Flesch reading ease and
    Flesch-Kincaid grade of the remaining text are printed and written to
    ``outF`` (an open file object defined outside this function).

    :param fileName: path of the text file to analyse
    """
    # convert to one long, massive string so algo can find words
    # NOTE(review): newlines are replaced with '' (not ' '), which glues the
    # last word of a line to the first word of the next - confirm intent.
    text = Path(fileName).read_text(encoding='utf8')
    text = text.replace('\n', '').replace('*', '')

    # Remove each "[...]" span where it occurs. The original collected the
    # bracket contents and then str.replace()d them across the whole document,
    # so a span like "[a]" deleted every "a" in the text.
    text = re.sub(r'\[.*?\]', '', text)
    # Drop any unmatched brackets that are left over.
    text = text.replace('[', '').replace(']', '')

    # Score once and reuse the values for both the console and the file.
    reading_ease = textstat.flesch_reading_ease(text)
    grade = textstat.flesch_kincaid_grade(text)

    print('Reading ease is ' + str(reading_ease))
    print('Score is ' + str(grade))

    # NOTE(review): fileName[105:] assumes a fixed-length directory prefix in
    # the caller's paths - verify before reusing elsewhere.
    outF.write(fileName[105:] + ' ' + str(reading_ease) + ' ' + str(grade) + '\n')

    # 417 political news articles from BBC
    # 386 entertainment news articles from BBC
示例#31
0
def create_readability_plots_per_episode(file_list):
    """
    Show a bar chart of the Flesch-Kincaid grade of each file.

    Every entry of ``file_list`` is loaded with ``create_dataframe``; the
    values of its ``'Line'`` column are joined with single spaces and scored
    with ``textstat.flesch_kincaid_grade``. The scores are printed, then
    plotted as bars numbered 1..N in input order.
    :param file_list: iterable of inputs accepted by ``create_dataframe``
    :return: None
    """
    # seaborn draws through matplotlib; current seaborn releases do not
    # expose a ``plt`` attribute, so pyplot is imported directly.
    import matplotlib.pyplot as plt

    readability_list = []
    for file in file_list:
        df_individual = create_dataframe(file)
        # Iterating a Series yields its values, so no array conversion is
        # needed (``as_matrix`` was removed in pandas 1.0).
        all_speech = " ".join(df_individual['Line'])
        readability_list.append(textstat.flesch_kincaid_grade(all_speech))

    print(readability_list)

    # One bar per file, labelled by 1-based position.
    x = list(range(1, len(readability_list) + 1))
    sns.barplot(x=x, y=readability_list)
    plt.show()
示例#32
0
def test_flesch_kincaid_grade():
    """The Flesch-Kincaid grade of the ``long_test`` sample must equal 10.0."""
    assert textstat.flesch_kincaid_grade(long_test) == 10.0
示例#33
0
# Upper-case character names, grouped by series. Each active list is
# followed by a wider, currently disabled roster kept for reference.
main_characters_TOS = [
    'KIRK', 'SPOCK', 'MCCOY', 'SCOTT',
    'UHURA', 'SULU', 'CHEKOV', 'CHAPEL',
]
# main_characters_TOS = ['KIRK', 'SPOCK', 'MCCOY', 'SCOTT', 'UHURUA', 'SULU', 'CHEKOV', 'CHAPEL', 'RAND', 'SAAVIK', 'SAREK']

main_characters_TNG = [
    'PICARD', 'RIKER', 'TROI', 'CRUSHER', 'WESLEY', 'DATA',
    'LAFORGE', 'WORF', 'PULASKI', 'TASHA', "O'BRIEN", 'BARCLAY',
    'RO', 'GUINAN', 'KEIKO', 'LWAXANA', 'Q', 'GOWRON',
]
# main_characters_TNG = ['PICARD', 'RIKER', 'TROI', 'CRUSHER', 'WESLEY', 'DATA', 'LAFORGE', 'WORF', 'PULASKI', 'TASHA', "O'BRIEN", 'BARCLAY', 'OGAWA', 'RO', 'GUINAN', 'KEIKO', 'LWAXANA', 'Q', 'GOWRON', 'ALEXANDER', 'KURN', 'LURSA', "B'ETOR", 'LORE', "K'EHLEYR", 'TRAVELLER', 'VASH', 'TOMALAK', 'NECHAYEV', 'LEFLER', 'MOT']

main_characters_DS9 = [
    'SISKO', 'KIRA', 'ODO', 'DAX', 'BASHIR', 'EZRI', 'QUARK',
    'JAKE', 'ROM', 'NOG', 'LEETA', 'GARAK', 'KASIDY', 'MARTOK',
    'ZIYAL', 'DUKAT', 'WINN', 'WEYOUN', 'FEMALE', 'DAMAR', 'VIC',
    'ZEK', 'BAREIL', 'ISHKA', 'SLOAN', 'OPAKA', 'BRUNT', 'SHAKAAR',
]
# main_characters_DS9 = ['SISKO', 'KIRA', 'ODO', 'DAX', 'BASHIR', 'EZRI', 'QUARK', 'JAKE', 'ROM', 'NOG', 'LEETA', 'GARAK', 'KASIDY', 'MARTOK', 'ZIYAL', 'DUKAT', 'ROSS', 'WINN', 'WEYOUN', 'FEMALE', 'DAMAR', 'EDDINGTON', 'VIC', 'ZEK', 'BAREIL', 'ISHKA', 'SLOAN', 'TAIN', 'OPAKA', 'BRUNT', 'JOSEPH', 'SHAKAAR', 'MORA', 'KOR', 'EVEK', 'CRETAK', 'MILA']

main_characters_Voyager = [
    'JANEWAY', 'CHAKOTAY', 'TUVOK', 'PARIS', 'TORRES', 'KIM',
    'SEVEN', 'EMH', 'NEELIX', 'KES', 'SESKA', 'QUEEN',
]

main_characters_Enterprise = [
    'ARCHER', "T'POL", 'TUCKER', 'REED', 'TRAVIS',
    'HOSHI', 'PHLOX', 'FORREST', 'SOVAL', 'DANIELS',
]

# Every roster in one flat list, in the order the series are listed above.
main_chars_all = [
    *main_characters_TOS,
    *main_characters_TNG,
    *main_characters_DS9,
    *main_characters_Voyager,
    *main_characters_Enterprise,
]

# Print the Flesch-Kincaid grade of each listed character's combined lines.
# NOTE(review): ``df`` must already exist here; in the visible code it is
# only assigned further down - confirm it is built earlier in the file.
for char in main_chars_all:
    # Select outside the ``try`` so a missing ``df`` or column surfaces as
    # an error instead of silently skipping every character.
    char_lines = df.loc[df['Character'] == char, 'Line']
    try:
        # Iterating a Series yields its values, so no array conversion is
        # needed (``as_matrix`` was removed in pandas 1.0).
        speech = " ".join(char_lines)
        print(char, ':', textstat.flesch_kincaid_grade(speech))
    except Exception:
        # Best effort: skip characters whose lines cannot be joined or
        # scored, but let KeyboardInterrupt / SystemExit propagate.
        continue


# """remove punctuation from lines (after we remove stuff in parentheses)"""
# # df_cleaner = remove_punctuation(df_clean)
# df['Line'] = df['Line'].apply(clean_line)
# df['Character'] = df['Character'].apply(clean_character)
# df['Location'] = df['Location'].apply(clean_location)

# for 

# print(df)

import seaborn as sns


# Can create graphs of readability across entire seasons or whole series.
# Every command-line argument is one input for create_df.create_dataframe.

file_list = sys.argv[1:]

count = 0
file_name_list = []
readability_list = []
for file in file_list:
    # Create Dataframe
    df_individual = create_df.create_dataframe(file)
    # Iterating a Series yields its values, so no array conversion is
    # needed (``as_matrix`` was removed in pandas 1.0).
    all_speech = " ".join(df_individual['Line'])
    readability_list.append(textstat.flesch_kincaid_grade(all_speech))

    # Combine all dfs into one big dataframe
    if count == 0:
        df = df_individual
        count += 1
    else:
        df = pd.concat([df, df_individual])

# One bar per input file, labelled by 1-based position.
end = len(readability_list) + 1
x = list(range(1, end))

# Save through the Axes that barplot returns; current seaborn releases do
# not expose a ``plt`` attribute.
ax = sns.barplot(x=x, y=readability_list)
ax.figure.savefig('figures/Animated_readability_per_episode')