示例#1
0
 def get_readability_scores(self, doc):
     """Compute a bundle of textstat readability metrics for a document.

     Args:
         doc: a document object; only ``doc.text`` is read.

     Returns:
         dict: metric name mapped to its score for the document text,
         with ``text_standard`` converted to an integer grade.
     """
     segment = doc.text
     # Explicit (key, scorer) table keeps the output keys stable even if
     # the underlying function names ever change.
     scorers = (
         ("automated_readability_index", textstat.automated_readability_index),
         ("coleman_liau_index", textstat.coleman_liau_index),
         ("dale_chall_readability_score", textstat.dale_chall_readability_score),
         ("difficult_words", textstat.difficult_words),
         ("flesch_kincaid_grade", textstat.flesch_kincaid_grade),
         ("flesch_reading_ease", textstat.flesch_reading_ease),
         ("gunning_fog", textstat.gunning_fog),
         ("linsear_write_formula", textstat.linsear_write_formula),
         ("smog_index", textstat.smog_index),
     )
     readability_dict = {key: score(segment) for key, score in scorers}
     # text_standard returns a grade string; normalize it to an integer.
     readability_dict["text_standard"] = self._convert_text_standard_to_integer(
         textstat.text_standard(segment))
     return readability_dict
def preprocess_text(text):
    """Takes a text, generate features, and returns as dict

    Args:
        text (str): the text to be preprocessed.

    Returns:
        dict: a dictionary of feature names with associated values

    """
    text = _simplify_punctuation(text)

    # Plain textstat scorers, listed in output order.
    scorers = (
        ("flesch_reading_ease", textstat.flesch_reading_ease),
        ("smog_index", textstat.smog_index),
        ("flesch_kincaid_grade", textstat.flesch_kincaid_grade),
        ("coleman_liau_index", textstat.coleman_liau_index),
        ("automated_readability_index", textstat.automated_readability_index),
        ("dale_chall_readability_score", textstat.dale_chall_readability_score),
        ("difficult_words", textstat.difficult_words),
        ("linsear_write_formula", textstat.linsear_write_formula),
        ("gunning_fog", textstat.gunning_fog),
    )
    features = {name: score(text) for name, score in scorers}

    # Features with non-default arguments or non-textstat helpers.
    features["text_standard"] = textstat.text_standard(text, float_output=True)
    features["mean_parse_tree_depth"] = get_mean_parse_tree_depth(text)
    features["mean_ents_per_sentence"] = get_mean_ents_per_sentence(text)
    features.update(get_mean_pos_tags(text))

    return features
示例#3
0
def parse_comment(subreddit_name, body):
    """Compute readability metrics for a single comment.

    Args:
        subreddit_name: display name of the subreddit the comment came from.
        body (str): the comment text.

    Returns:
        tuple: (subreddit_name, sentences, words, syllables, trisyllabic,
        flesch_kincaid_grade, smog_index)
    """
    # raw metrics
    sentences = ts.sentence_count(body)
    words = ts.lexicon_count(body)
    syllables = ts.syllable_count(body)
    # NOTE(review): trisyllab_count is not a documented textstat function —
    # the public API exposes polysyllabcount(); confirm against the ts module.
    trisyllabic = ts.trisyllab_count(body)

    # derived
    fk_grade = ts.flesch_kincaid_grade(body)
    smog = ts.smog_index(body)

    # BUG FIX: the original returned sub.display_name, but `sub` is not in
    # scope here (NameError at runtime); the caller already passes the name.
    return (subreddit_name, sentences, words, syllables, trisyllabic, fk_grade, smog)
示例#4
0
def get_score(text):
    """Return a list of normalized readability statistics for *text*.

    Most entries are shifted/scaled as (metric - center) / spread using the
    module-level MEAN_*/STD_* constants; avg_sentence_per_word is appended
    unnormalized, and the grade-level indices are all scaled by MEAN_GRADE.
    """
    def scaled(value, center, spread):
        # z-score-style normalization shared by every entry below.
        return (value - center) / spread

    scores = [
        scaled(tst.avg_sentence_length(text), MEAN_SL, STD_SL),
        scaled(tst.avg_letter_per_word(text), MEAN_AL, STD_AL),
        tst.avg_sentence_per_word(text),
        scaled(tst.sentence_count(text), MEAN_SC, STD_SC),
        scaled(tst.flesch_kincaid_grade(text), MEAN_GRADE, MEAN_GRADE),
        scaled(tst.flesch_reading_ease(text), 50, 50),
    ]
    # Remaining grade-level metrics share the same centering and spread.
    for grade_metric in (tst.smog_index,
                         tst.coleman_liau_index,
                         tst.automated_readability_index,
                         tst.dale_chall_readability_score,
                         tst.linsear_write_formula,
                         tst.gunning_fog):
        scores.append(scaled(grade_metric(text), MEAN_GRADE, MEAN_GRADE))
    return scores
示例#5
0
文件: views.py 项目: klown/clusive
 def form_valid(self, form):
     """Compute readability statistics and per-word frequency info for the
     submitted text, then re-render the same form page with the results.

     Side effects: sets ``self.stats`` (list of metric dicts for display)
     and ``self.words`` (word info sorted by ascending Zipf frequency).
     """
     text = form.cleaned_data['text']
     word_list = wf.tokenize(text, self.lang)
     # PERF: the original called difficult_words() and lexicon_count() twice
     # each; both scan the whole text, so compute them once up front.
     n_words = textstat.lexicon_count(text)
     n_difficult = textstat.difficult_words(text)
     # BUG FIX: guard against ZeroDivisionError when the text has no words.
     pct_difficult = 100 * n_difficult / n_words if n_words else 0
     self.stats = [
         { 'name': 'Flesch-Kincaid grade level',
           'value':  textstat.flesch_kincaid_grade(text),
           'desc': 'Based on avg sentence length and syllables per word.'},
         { 'name': 'Dale-Chall grade level',
           'value': textstat.dale_chall_readability_score_v2(text),
           'desc': 'Based on avg sentence length and percent difficult words.'},
         { 'name': 'Number of words',
           'value': n_words },
         { 'name': 'Number of sentences',
           'value': textstat.sentence_count(text) },
         { 'name': 'Average sentence length',
           'value': textstat.avg_sentence_length(text) },
         { 'name': 'Average syllables per word',
           'value': textstat.avg_syllables_per_word(text) },
         { 'name': 'Difficult words',
           'value': "%d (%d%%): %s" % (n_difficult, pct_difficult,
                                       ', '.join(textstat.difficult_words_list(text))) },
     ]
     # Aggregate tokens by base form, tracking counts and surface variants.
     word_info = {}
     for word in word_list:
         base = base_form(word)
         w = word_info.get(base)
         if w:
             w['count'] += 1
             # Record distinct inflected forms that differ from the base.
             if word != base and word not in w['alts']:
                 w['alts'].append(word)
         else:
             w = {
                 'hw' : base,
                 'alts' : [],
                 'count' : 1,
                 'freq' : wf.zipf_frequency(base, self.lang)
             }
             if word != base:
                 w['alts'].append(word)
             word_info[base] = w
     # Rarest words first (lowest Zipf frequency).
     self.words = sorted(word_info.values(), key=lambda x: x.get('freq'))
     logger.debug('words: %s', self.words)
     # Don't do normal process of redirecting to success_url.  Just stay on this form page forever.
     return self.render_to_response(self.get_context_data(form=form))
示例#6
0
 def post(self, args):
     """Return a JSON response of textstat readability metrics for the
     submitted text.

     Args:
         args: parsed request arguments; only ``args['text']`` is read.

     Returns:
         A Flask JSON response mapping metric names to their scores.
     """
     text = args['text']
     # Single dict literal keeps the response keys in one visible place,
     # in the same order the original assigned them.
     readability = {
         "flesch_reading_ease": textstat.flesch_reading_ease(text),
         "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
         "smog_index": textstat.smog_index(text),
         "coleman_liau_index": textstat.coleman_liau_index(text),
         "automated_readability_index": textstat.automated_readability_index(text),
         "dale_chall_readability_score": textstat.dale_chall_readability_score(text),
         "linsear_write_formula": textstat.linsear_write_formula(text),
         "gunning_fog": textstat.gunning_fog(text),
         "text_standard": textstat.text_standard(text),
         "difficult_words": textstat.difficult_words(text),
     }
     return jsonify(readability)
示例#7
0
 def get_delta(self):
     """Absolute difference in Flesch-Kincaid grade between the input and
     output texts held on this instance."""
     grade_in = textstat.flesch_kincaid_grade(self.input_data)
     grade_out = textstat.flesch_kincaid_grade(self.output_data)
     return abs(grade_in - grade_out)
示例#8
0
    # NOTE(review): this span is the tail of a loop body whose header is not
    # visible here; `text`, `name`, `words`, `space_special_chars`,
    # `info_dict`, and `info_dicts` must be defined earlier in the file.
    sentences = sent_tokenize(text)
    words_per_sentence = [word_tokenize(sent) for sent in sentences]
    # Character count excluding whitespace/special characters per the regex.
    non_space_chars = re.sub(space_special_chars, '', text)

    words_len = pd.Series([len(word) for word in words])
    sentences_len = pd.Series([len(sent) for sent in sentences])
    len_words_per_sentence = pd.Series([len(wps) for wps in words_per_sentence])


    # describe() yields count/mean/std/min/quartiles/max; prefix the index
    # labels so the three stat families stay distinct after merging below.
    word_stats = words_len.describe()
    word_stats.index = ['Word ' + i for i in word_stats.index]

    sent_stats = sentences_len.describe()
    sent_stats.index = ['Sentence ' + i for i in sent_stats.index]

    wps_stats = len_words_per_sentence.describe()
    wps_stats.index = ['Words per sentences ' + i for i in wps_stats.index]
    info_dict['Name'] = name
    info_dict['Total characters'] = len(non_space_chars)
    #info_dict['Total sentences'] = len(sentences)
    info_dict.update(word_stats.to_dict())
    info_dict.update(sent_stats.to_dict())
    info_dict.update(wps_stats.to_dict())
    # Whole-text readability indices.
    info_dict['Flesch-Kincaid'] = textstat.flesch_kincaid_grade(text)
    info_dict['Gunning fog'] = textstat.gunning_fog(text)
    info_dict['SMOG'] = textstat.smog_index(text)

    info_dicts.append(info_dict)

# One row per analyzed document, written to a hard-coded local path.
df = pd.DataFrame(info_dicts)
df.to_csv(r'C:\Users\Krista\DocumentsRE _Call_re_potential_matter\code_results_contents_removed.csv')
示例#9
0
from textstat import textstat

# Smoke-test script: print every textstat readability metric for a fixed
# sample sentence.
if __name__ == '__main__':
    test_data = 'The quick brown fox jumps over the lazy dog'

    # BUG FIX: the print calls originally ran at module level while
    # test_data was only bound under the __main__ guard, so importing this
    # module raised NameError. Everything now runs only when executed
    # directly.
    print(textstat.flesch_reading_ease(test_data))
    print(textstat.smog_index(test_data))
    print(textstat.flesch_kincaid_grade(test_data))
    print(textstat.coleman_liau_index(test_data))
    print(textstat.automated_readability_index(test_data))
    print(textstat.dale_chall_readability_score(test_data))
    print(textstat.difficult_words(test_data))
    print(textstat.linsear_write_formula(test_data))
    print(textstat.gunning_fog(test_data))
    print(textstat.text_standard(test_data))
示例#10
0
文件: main.py 项目: twango-dev/probr
def index():
    """Analyze a JSON request: per-message and per-sentence readability
    statistics, plus an optional (slow) grammar check via LanguageTool.

    Expects JSON with keys 'unique_id', 'process_language', and 'message';
    returns a dict mirroring the request id with text statistics attached.
    """
    data = request.json
    print(f'Debug: {data}')

    unique_id = data['unique_id']
    process_language = data['process_language']
    message = data['message']

    matches_list = None
    if process_language:
        # Language tool takes a while to process
        language_tool = LanguageTool('en-US')
        matches: list[Match] = language_tool.check(message)
        matches_list = [match_to_dict(match) for match in matches]
        print(f'Analysis finished: {matches_list}')

    sentences: list = splitter.split(text=message)

    def metric_entry(fn, per_sentence=True, **extra):
        # One readability entry: whole-message score, optionally the same
        # metric per sentence ('sps'), plus any extra keys appended last.
        entry = {'score': fn(message)}
        if per_sentence:
            entry['sps'] = list_map(sentences, fn)
        entry.update(extra)
        return entry

    readability = {
        'flesch_reading_ease': metric_entry(textstat.flesch_reading_ease),
        'smog_index': metric_entry(textstat.smog_index, per_sentence=False),
        'flesch_kincaid_grade': metric_entry(textstat.flesch_kincaid_grade),
        'coleman_liau_index': metric_entry(textstat.coleman_liau_index),
        'automated_readability_index': metric_entry(
            textstat.automated_readability_index),
        'dale_chall_readability_score': metric_entry(
            textstat.dale_chall_readability_score),
        'difficult_words': metric_entry(
            textstat.difficult_words,
            words=textstat.difficult_words_list(message)),
        # linsear score is rounded, but the per-sentence values are not.
        'linsear_write_formula': {
            'score': round(textstat.linsear_write_formula(message), 2),
            'sps': list_map(sentences, textstat.linsear_write_formula),
        },
        'gunning_fog': metric_entry(textstat.gunning_fog),
        'text_standard': metric_entry(textstat.text_standard,
                                      per_sentence=False),
    }

    return {
        'unique_id': unique_id,
        'text_statistics': {
            'lexicon_count': textstat.lexicon_count(message),
            'lexicon_count_ps': list_map(sentences, textstat.lexicon_count),
            'syllable_count': textstat.syllable_count(message),
            'syllable_count_ps': list_map(sentences, textstat.syllable_count),
            'sentences': sentences,
            'sentence_count': len(sentences),
            'readability': readability,
        },
        'language_tool': matches_list,
    }