def preprocess_text(text):
    """Takes a text, generate features, and returns as dict

    Args:
        text (str): the text to be preprocessed.

    Returns:
        dict: a dictionary of feature names with associated values

    """
    text = _simplify_punctuation(text)

    features = {
        "flesch_reading_ease": textstat.flesch_reading_ease(text),
        "smog_index": textstat.smog_index(text),
        "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
        "coleman_liau_index": textstat.coleman_liau_index(text),
        "automated_readability_index":
        textstat.automated_readability_index(text),
        "dale_chall_readability_score":
        textstat.dale_chall_readability_score(text),
        "difficult_words": textstat.difficult_words(text),
        "linsear_write_formula": textstat.linsear_write_formula(text),
        "gunning_fog": textstat.gunning_fog(text),
        "text_standard": textstat.text_standard(text, float_output=True),
        "mean_parse_tree_depth": get_mean_parse_tree_depth(text),
        "mean_ents_per_sentence": get_mean_ents_per_sentence(text),
    }

    features.update(get_mean_pos_tags(text))

    return features
예제 #2
0
 def get_readability_scores(self, doc):
     segment = doc.text
     readability_dict = {
         "automated_readability_index":
         textstat.automated_readability_index(segment),
         "coleman_liau_index":
         textstat.coleman_liau_index(segment),
         "dale_chall_readability_score":
         textstat.dale_chall_readability_score(segment),
         "difficult_words":
         textstat.difficult_words(segment),
         "flesch_kincaid_grade":
         textstat.flesch_kincaid_grade(segment),
         "flesch_reading_ease":
         textstat.flesch_reading_ease(segment),
         "gunning_fog":
         textstat.gunning_fog(segment),
         "linsear_write_formula":
         textstat.linsear_write_formula(segment),
         "smog_index":
         textstat.smog_index(segment),
         "text_standard":
         self._convert_text_standard_to_integer(
             textstat.text_standard(segment)),
     }
     return readability_dict
예제 #3
0
 def encode(self, sentence: str) -> np.ndarray:
     if not isinstance(sentence, str):
         sentence = ''
     return np.array([
         textstat.flesch_reading_ease(sentence),
         textstat.syllable_count(sentence),
         textstat.text_standard(sentence, float_output=True),
         textstat.syllable_count(sentence),
         textstat.lexicon_count(sentence, removepunct=True)
     ])
예제 #4
0
 def level_score(self, text):
     score = textstat.text_standard(text)
     grade = int(score[0])
     levels = {
         0: "A1 - Low",
         1: "A1 - High",
         2: "A2 - Low",
         3: "A2 - High",
         4: "B1 - Low",
         5: "B1 - High",
         6: "B2 - Low",
         7: "B2 - High",
         8: "C1 - Low",
         9: "C1 - High",
         10: "C2 - Low",
         11: "C2 - High",
     }
     return levels[grade]
예제 #5
0
 def post(self, args):
     text = args['text']
     readability = {}
     readability["flesch_reading_ease"] = textstat.flesch_reading_ease(text)
     readability["flesch_kincaid_grade"] = textstat.flesch_kincaid_grade(
         text)
     readability["smog_index"] = textstat.smog_index(text)
     readability["coleman_liau_index"] = textstat.coleman_liau_index(text)
     readability[
         "automated_readability_index"] = textstat.automated_readability_index(
             text)
     readability[
         "dale_chall_readability_score"] = textstat.dale_chall_readability_score(
             text)
     readability["linsear_write_formula"] = textstat.linsear_write_formula(
         text)
     readability["gunning_fog"] = textstat.gunning_fog(text)
     readability["text_standard"] = textstat.text_standard(text)
     readability["difficult_words"] = textstat.difficult_words(text)
     return jsonify(readability)
예제 #6
0
 def metadata_function(self, language, text):
     if not isinstance(text, str) or language != 'en':
         return float('nan')
     return textstat.text_standard(text, True)
예제 #7
0
from textstat import textstat
if __name__ == '__main__':
    test_data = 'The quick brown fox jumps over the lazy dog'

#File to be used to test the function
print(textstat.flesch_reading_ease(test_data))
print(textstat.smog_index(test_data))
print(textstat.flesch_kincaid_grade(test_data))
print(textstat.coleman_liau_index(test_data))
print(textstat.automated_readability_index(test_data))
print(textstat.dale_chall_readability_score(test_data))
print(textstat.difficult_words(test_data))
print(textstat.linsear_write_formula(test_data))
print(textstat.gunning_fog(test_data))
print(textstat.text_standard(test_data))
예제 #8
0
파일: main.py 프로젝트: twango-dev/probr
def index():
    data = request.json
    print(f'Debug: {data}')

    unique_id = data['unique_id']
    process_language = data['process_language']
    message = data['message']

    matches_list = None
    if process_language:
        # Language tool takes a while to process
        language_tool = LanguageTool('en-US')
        matches: list[Match] = language_tool.check(message)

        matches_list = []
        for match in matches:
            matches_list.append(match_to_dict(match))
        print(f'Analysis finished: {matches_list}')

    sentences: list = splitter.split(text=message)

    return {
        'unique_id': unique_id,
        'text_statistics': {
            'lexicon_count': textstat.lexicon_count(message),
            'lexicon_count_ps': list_map(sentences, textstat.lexicon_count),
            'syllable_count': textstat.syllable_count(message),
            'syllable_count_ps': list_map(sentences, textstat.syllable_count),
            'sentences': sentences,
            'sentence_count': len(sentences),
            'readability': {
                'flesch_reading_ease': {
                    'score': textstat.flesch_reading_ease(message),
                    'sps': list_map(sentences, textstat.flesch_reading_ease)
                },
                'smog_index': {
                    'score': textstat.smog_index(message)
                },
                'flesch_kincaid_grade': {
                    'score': textstat.flesch_kincaid_grade(message),
                    'sps': list_map(sentences, textstat.flesch_kincaid_grade)
                },
                'coleman_liau_index': {
                    'score': textstat.coleman_liau_index(message),
                    'sps': list_map(sentences, textstat.coleman_liau_index)
                },
                'automated_readability_index': {
                    'score':
                    textstat.automated_readability_index(message),
                    'sps':
                    list_map(sentences, textstat.automated_readability_index)
                },
                'dale_chall_readability_score': {
                    'score':
                    textstat.dale_chall_readability_score(message),
                    'sps':
                    list_map(sentences, textstat.dale_chall_readability_score)
                },
                'difficult_words': {
                    'score': textstat.difficult_words(message),
                    'sps': list_map(sentences, textstat.difficult_words),
                    'words': textstat.difficult_words_list(message)
                },
                'linsear_write_formula': {
                    'score': round(textstat.linsear_write_formula(message), 2),
                    'sps': list_map(sentences, textstat.linsear_write_formula)
                },
                'gunning_fog': {
                    'score': textstat.gunning_fog(message),
                    'sps': list_map(sentences, textstat.gunning_fog)
                },
                'text_standard': {
                    'score': textstat.text_standard(message)
                }
            }
        },
        'language_tool': matches_list
    }