def preprocess_text(text):
    """Takes a text, generates features, and returns them as a dict.

    Args:
        text (str): the text to be preprocessed.

    Returns:
        dict: a dictionary of feature names with associated values
    """
    text = _simplify_punctuation(text)
    features = {
        "flesch_reading_ease": textstat.flesch_reading_ease(text),
        "smog_index": textstat.smog_index(text),
        "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
        "coleman_liau_index": textstat.coleman_liau_index(text),
        "automated_readability_index": textstat.automated_readability_index(text),
        "dale_chall_readability_score": textstat.dale_chall_readability_score(text),
        "difficult_words": textstat.difficult_words(text),
        "linsear_write_formula": textstat.linsear_write_formula(text),
        "gunning_fog": textstat.gunning_fog(text),
        "text_standard": textstat.text_standard(text, float_output=True),
        "mean_parse_tree_depth": get_mean_parse_tree_depth(text),
        "mean_ents_per_sentence": get_mean_ents_per_sentence(text),
    }
    features.update(get_mean_pos_tags(text))
    return features
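The helpers _simplify_punctuation, get_mean_parse_tree_depth, get_mean_ents_per_sentence, and get_mean_pos_tags are defined elsewhere in the project and are not shown here. As a rough sketch only, one plausible shape for get_mean_ents_per_sentence (assuming spaCy with the en_core_web_sm model, which the snippet itself does not confirm) could be:

import spacy

nlp = spacy.load("en_core_web_sm")

def get_mean_ents_per_sentence(text):
    # hypothetical helper: average number of named entities per sentence
    doc = nlp(text)
    sentences = list(doc.sents)
    if not sentences:
        return 0.0
    return len(doc.ents) / len(sentences)

The real helpers may use a different NLP pipeline; the sketch is only meant to show the kind of per-sentence aggregate the feature names suggest.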
def get_readability_scores(self, doc):
    segment = doc.text
    readability_dict = {
        "automated_readability_index": textstat.automated_readability_index(segment),
        "coleman_liau_index": textstat.coleman_liau_index(segment),
        "dale_chall_readability_score": textstat.dale_chall_readability_score(segment),
        "difficult_words": textstat.difficult_words(segment),
        "flesch_kincaid_grade": textstat.flesch_kincaid_grade(segment),
        "flesch_reading_ease": textstat.flesch_reading_ease(segment),
        "gunning_fog": textstat.gunning_fog(segment),
        "linsear_write_formula": textstat.linsear_write_formula(segment),
        "smog_index": textstat.smog_index(segment),
        "text_standard": self._convert_text_standard_to_integer(
            textstat.text_standard(segment)
        ),
    }
    return readability_dict
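_convert_text_standard_to_integer is not shown. Because textstat.text_standard returns a grade-band string such as "8th and 9th grade" when float_output is left at its default, a plausible sketch of that helper (name and parsing strategy are assumptions, not the project's actual code) is:

import re

def _convert_text_standard_to_integer(text_standard):
    # hypothetical helper: pull the first grade number out of a string
    # like "8th and 9th grade" and return it as an int
    match = re.search(r"-?\d+", text_standard)
    return int(match.group()) if match else 0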
def encode(self, sentence: str) -> np.ndarray:
    if not isinstance(sentence, str):
        sentence = ''
    return np.array([
        textstat.flesch_reading_ease(sentence),
        textstat.syllable_count(sentence),
        textstat.text_standard(sentence, float_output=True),
        textstat.syllable_count(sentence),  # note: syllable_count appears twice in this feature vector
        textstat.lexicon_count(sentence, removepunct=True),
    ])
def level_score(self, text):
    # text_standard(float_output=True) returns the grade level as a number,
    # which avoids parsing strings such as "10th and 11th grade" by their first character
    grade = int(textstat.text_standard(text, float_output=True))
    levels = {
        0: "A1 - Low",
        1: "A1 - High",
        2: "A2 - Low",
        3: "A2 - High",
        4: "B1 - Low",
        5: "B1 - High",
        6: "B2 - Low",
        7: "B2 - High",
        8: "C1 - Low",
        9: "C1 - High",
        10: "C2 - Low",
        11: "C2 - High",
    }
    # clamp to the mapped range so out-of-range grades do not raise a KeyError
    grade = max(0, min(grade, 11))
    return levels[grade]
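For reference, textstat.text_standard returns a grade-band string by default and a numeric grade when float_output=True; the numeric form is what the conversion above relies on. A quick check:

import textstat

sample = "The quick brown fox jumps over the lazy dog."
print(textstat.text_standard(sample))                     # grade-band string, e.g. of the form "Nth and Mth grade"
print(textstat.text_standard(sample, float_output=True))  # numeric grade level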
def post(self, args):
    text = args['text']
    readability = {}
    readability["flesch_reading_ease"] = textstat.flesch_reading_ease(text)
    readability["flesch_kincaid_grade"] = textstat.flesch_kincaid_grade(text)
    readability["smog_index"] = textstat.smog_index(text)
    readability["coleman_liau_index"] = textstat.coleman_liau_index(text)
    readability["automated_readability_index"] = textstat.automated_readability_index(text)
    readability["dale_chall_readability_score"] = textstat.dale_chall_readability_score(text)
    readability["linsear_write_formula"] = textstat.linsear_write_formula(text)
    readability["gunning_fog"] = textstat.gunning_fog(text)
    readability["text_standard"] = textstat.text_standard(text)
    readability["difficult_words"] = textstat.difficult_words(text)
    return jsonify(readability)
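The snippet does not show how this handler is registered. It reads like a Flask-RESTful resource whose parsed arguments are injected as a dict; purely as an assumed wiring sketch (the resource class, argument schema, and route below are hypothetical), it might sit in an app like this:

from flask import Flask, jsonify
from flask_restful import Api, Resource
from webargs import fields
from webargs.flaskparser import use_args
import textstat

app = Flask(__name__)
api = Api(app)

class Readability(Resource):  # hypothetical resource wrapping a post(self, args) handler
    @use_args({"text": fields.Str(required=True)}, location="json")
    def post(self, args):
        # minimal body; the full handler would build the readability dict shown above
        return jsonify({"text_standard": textstat.text_standard(args["text"])})

api.add_resource(Readability, "/readability")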
def metadata_function(self, language, text):
    if not isinstance(text, str) or language != 'en':
        return float('nan')
    return textstat.text_standard(text, float_output=True)
from textstat import textstat

if __name__ == '__main__':
    # Sample sentence used to exercise the textstat functions
    test_data = 'The quick brown fox jumps over the lazy dog'
    print(textstat.flesch_reading_ease(test_data))
    print(textstat.smog_index(test_data))
    print(textstat.flesch_kincaid_grade(test_data))
    print(textstat.coleman_liau_index(test_data))
    print(textstat.automated_readability_index(test_data))
    print(textstat.dale_chall_readability_score(test_data))
    print(textstat.difficult_words(test_data))
    print(textstat.linsear_write_formula(test_data))
    print(textstat.gunning_fog(test_data))
    print(textstat.text_standard(test_data))
def index():
    data = request.json
    print(f'Debug: {data}')
    unique_id = data['unique_id']
    process_language = data['process_language']
    message = data['message']

    matches_list = None
    if process_language:
        # LanguageTool takes a while to process
        language_tool = LanguageTool('en-US')
        matches: list[Match] = language_tool.check(message)
        matches_list = [match_to_dict(match) for match in matches]
        print(f'Analysis finished: {matches_list}')

    sentences: list = splitter.split(text=message)

    return {
        'unique_id': unique_id,
        'text_statistics': {
            'lexicon_count': textstat.lexicon_count(message),
            'lexicon_count_ps': list_map(sentences, textstat.lexicon_count),
            'syllable_count': textstat.syllable_count(message),
            'syllable_count_ps': list_map(sentences, textstat.syllable_count),
            'sentences': sentences,
            'sentence_count': len(sentences),
            'readability': {
                'flesch_reading_ease': {
                    'score': textstat.flesch_reading_ease(message),
                    'sps': list_map(sentences, textstat.flesch_reading_ease)
                },
                'smog_index': {
                    'score': textstat.smog_index(message)
                },
                'flesch_kincaid_grade': {
                    'score': textstat.flesch_kincaid_grade(message),
                    'sps': list_map(sentences, textstat.flesch_kincaid_grade)
                },
                'coleman_liau_index': {
                    'score': textstat.coleman_liau_index(message),
                    'sps': list_map(sentences, textstat.coleman_liau_index)
                },
                'automated_readability_index': {
                    'score': textstat.automated_readability_index(message),
                    'sps': list_map(sentences, textstat.automated_readability_index)
                },
                'dale_chall_readability_score': {
                    'score': textstat.dale_chall_readability_score(message),
                    'sps': list_map(sentences, textstat.dale_chall_readability_score)
                },
                'difficult_words': {
                    'score': textstat.difficult_words(message),
                    'sps': list_map(sentences, textstat.difficult_words),
                    'words': textstat.difficult_words_list(message)
                },
                'linsear_write_formula': {
                    'score': round(textstat.linsear_write_formula(message), 2),
                    'sps': list_map(sentences, textstat.linsear_write_formula)
                },
                'gunning_fog': {
                    'score': textstat.gunning_fog(message),
                    'sps': list_map(sentences, textstat.gunning_fog)
                },
                'text_standard': {
                    'score': textstat.text_standard(message)
                }
            }
        },
        'language_tool': matches_list
    }
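list_map and match_to_dict are not part of textstat or language_tool_python; they appear to be small local helpers of this app. A plausible sketch, offered only as an assumption about their shape:

def list_map(items, func):
    # hypothetical helper: apply a textstat scorer to every sentence
    return [func(item) for item in items]

def match_to_dict(match):
    # hypothetical helper: flatten a language_tool_python Match into JSON-friendly fields
    return {
        "message": match.message,
        "ruleId": match.ruleId,
        "offset": match.offset,
        "errorLength": match.errorLength,
        "replacements": match.replacements,
    }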