def lex_readability(self, text, mode='fre'):
    """Return one textstat readability metric for ``text``, or all of them.

    ``mode`` selects what is computed:

    * ``'fre'``      -- ``textstat.flesch_reading_ease``
    * ``'fog'``      -- ``textstat.gunning_fog``
    * ``'fkg'``      -- ``textstat.flesch_kincaid_grade``
    * ``'dcr'``      -- ``textstat.dale_chall_readability_score``
    * ``'text_std'`` -- ``textstat.text_standard`` with ``float_output=True``
    * ``'all'``      -- a 5-tuple of the above, in the order
      (fre, fog, fkg, dcr, text_std)

    Any other ``mode`` falls through every branch and returns ``None``.
    """
    def standard_as_float(value):
        return textstat.text_standard(value, float_output=True)

    if mode == 'all':
        return (
            textstat.flesch_reading_ease(text),
            textstat.gunning_fog(text),
            textstat.flesch_kincaid_grade(text),
            textstat.dale_chall_readability_score(text),
            standard_as_float(text),
        )

    # (mode key, textstat function name) for the single-argument metrics.
    single_metrics = (
        ('fre', 'flesch_reading_ease'),
        ('fog', 'gunning_fog'),
        ('fkg', 'flesch_kincaid_grade'),
        ('dcr', 'dale_chall_readability_score'),
    )
    for key, function_name in single_metrics:
        if mode == key:
            return getattr(textstat, function_name)(text)

    if mode == 'text_std':
        return standard_as_float(text)

    return None
示例#2
0
def test_text_standard():
    """Check the grade label ``text_standard`` reports for both fixtures."""
    expected_by_sample = (
        (long_test, "9th and 10th grade"),
        (short_test, "2nd and 3rd grade"),
    )
    for sample, expected in expected_by_sample:
        assert textstat.text_standard(sample) == expected
示例#3
0
def test_text_standard():
    """``text_standard`` must label the long and short fixtures as expected."""
    long_label = textstat.text_standard(long_test)
    assert long_label == "9th and 10th grade"

    short_label = textstat.text_standard(short_test)
    assert short_label == "2nd and 3rd grade"
示例#4
0
    def test_text_standard(self):
        """Check the grade label ``text_standard`` gives each fixture text."""
        cases = (
            ("9th and 10th grade", self.long_test),
            ("2nd and 3rd grade", self.short_test),
        )
        for expected, sample in cases:
            self.assertEqual(expected, textstat.text_standard(sample))
示例#5
0
def test_text_standard():
    """With the language set to en_US, check both fixtures' grade labels."""
    textstat.set_lang("en_US")

    checks = (
        (long_test, "9th and 10th grade"),
        (short_test, "2nd and 3rd grade"),
    )
    for sample, expected in checks:
        standard = textstat.text_standard(sample)
        assert standard == expected
示例#6
0
文件: DE_main.py 项目: zzs-NLP/ACS-QG
def get_readibility(text, metric="flesch_kincaid_grade"):
    """
    Return a score which reveals a piece of text's readability level.

    ``metric`` is the name of the textstat function to apply to ``text``.
    An unsupported name prints an error message and yields ``None``.

    Reference: https://chartbeat-labs.github.io/textacy/getting_started/quickstart.html
               https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
    """
    supported_metrics = (
        "flesch_kincaid_grade",
        "flesch_reading_ease",
        "smog_index",
        "coleman_liau_index",
        "automated_readability_index",
        "dale_chall_readability_score",
        "difficult_words",
        "linsear_write_formula",
        "gunning_fog",
        "text_standard",
    )
    if metric in supported_metrics:
        # Every supported metric name is also the textstat function name.
        return getattr(textstat, metric)(text)

    print("ERROR: Please select correct metric!")
    return None
示例#7
0
 def check_difficulty(self):
     """Build a plain-text difficulty report for ``self.textoutput``.

     The report lists the textstat grade level, the sentence count, the
     number of difficult words, and one ``word -> replacement, ...`` line per
     entry returned by ``self.get_replacement_words``. The result is stored
     in ``self.difficultyReport``; nothing is returned.

     If any step raises an ordinary exception, the report is set to
     "Error determining Difficulties". Only ``Exception`` subclasses are
     caught, so KeyboardInterrupt and SystemExit propagate.
     """
     text = self.textoutput
     # The Flesch reading-ease score is not computed here: its report line
     # was disabled in the original code, which noted the score needs a chart
     # to interpret (0-30 = college, 50-60 = high school,
     # 60+ = middle/elementary school).
     try:
         grade_level = textstat.text_standard(text)
         sentence_count = textstat.sentence_count(text)
         difficult_words = self.get_difficult_words(text)
         replacement_words = self.get_replacement_words(difficult_words)

         report_lines = [
             "Grade Level of Input Text: " + grade_level,
             "Sentence Count: " + str(sentence_count),
             "Difficult Words Found: " + str(len(difficult_words)),
             "Possible Replacements: ",
         ]
         for dw in replacement_words:
             # Every replacement keeps its trailing ", " separator.
             replacements = "".join(
                 word + ", " for word in replacement_words[dw])
             report_lines.append(dw + " -> " + replacements)

         self.difficultyReport = "".join(
             line + "\n" for line in report_lines)
     except Exception:
         self.difficultyReport = "Error determining Difficulties"
示例#8
0
def calculate_stats(data_folder):
    """Print vocabulary and readability statistics for each dataset's test.json.

    For every key in the module-level ``dataset_fields`` mapping, loads
    ``<data_folder>/<dataset>/test.json`` and takes the configured text field
    of each item (the last element when that field is a list). The texts are
    joined with spaces, then the function prints: the textstat lexicon count,
    the unique-word counts from ``count_words`` (all, lowercase, uppercase),
    the uppercase/all ratio, and three textstat readability scores.

    Parameters
    ----------
    data_folder : str or Path
        Directory containing one sub-directory per dataset.
    """
    data_folder = Path(data_folder)
    for dataset in dataset_fields:
        print(f"loading {dataset}")
        field = dataset_fields[dataset]["text"].strip()

        # Close the file as soon as it is parsed instead of leaking the handle.
        with open(data_folder / dataset / "test.json") as handle:
            items = json.load(handle)

        sentences = [
            item[field][-1] if isinstance(item[field], list) else item[field]
            for item in items
        ]

        text = " ".join(sentences)
        lex_count = textstat.lexicon_count(text)
        print(lex_count)
        unique_words = count_words(text)
        print(f"all unique {len(unique_words)}")

        lower_unique_words = count_words(text, casing="lower")
        print(f"lowercase unique {len(lower_unique_words)}")

        upper_unique_words = count_words(text, casing="upper")
        print(f"uppercase unique {len(upper_unique_words)}")

        print(f"ratio {len(upper_unique_words) / len(unique_words)}")

        text_standard = textstat.text_standard(text, float_output=True)
        print(f"text_standard: {text_standard}")

        dale_chall_readability_score = textstat.dale_chall_readability_score(text)
        print(f"dale_chall_readability_score: {dale_chall_readability_score}")

        flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
        print(f"flesch_kincaid_grade: {flesch_kincaid_grade}")
示例#9
0
def analyze():
    """Score the request body with textstat and return the report as JSON.

    The raw request data is decoded as UTF-8 and stripped, each metric is
    computed on that string, and the resulting dict is passed through
    ``jsonify`` and ``decorate_response``.
    """
    print(request)
    str_to_read = request.data.decode("utf-8").strip()

    # (report key, textstat function) in the order the report lists them.
    metrics = (
        ("flesch-reading-ease", textstat.flesch_reading_ease),
        ("smog-index", textstat.smog_index),
        ("flesch-kincaid-grade", textstat.flesch_kincaid_grade),
        ("coleman-liau-index", textstat.coleman_liau_index),
        ("automated-readability-index", textstat.automated_readability_index),
        ("dale-chall-readability-score",
         textstat.dale_chall_readability_score),
        ("difficult-words", textstat.difficult_words),
        ("linsear-write-formula", textstat.linsear_write_formula),
        ("gunning-fog", textstat.gunning_fog),
        ("text-standard", textstat.text_standard),
    )
    report = {key: compute(str_to_read) for key, compute in metrics}
    return decorate_response(jsonify(report))
示例#10
0
def readability(queries):
    """Compute textstat readability metrics for every text in ``queries``.

    Parameters
    ----------
    queries : iterable of str
        Texts to score.

    Returns
    -------
    dict
        Maps each column label ('Flesch', 'Smog', 'Flesch grade', 'Coleman',
        'Automated', 'Dale', 'Difficult', 'Linsear', 'Gunning',
        'Text Standard') to a list holding one score per query, in input
        order. 'Text Standard' uses ``float_output=True``. An empty
        ``queries`` yields empty lists.
    """
    # (column label, textstat function name) for the single-argument metrics.
    metric_for_column = (
        ('Flesch', 'flesch_reading_ease'),
        ('Smog', 'smog_index'),
        ('Flesch grade', 'flesch_kincaid_grade'),
        ('Coleman', 'coleman_liau_index'),
        ('Automated', 'automated_readability_index'),
        ('Dale', 'dale_chall_readability_score'),
        ('Difficult', 'difficult_words'),
        ('Linsear', 'linsear_write_formula'),
        ('Gunning', 'gunning_fog'),
    )

    scores = {column: [] for column, _ in metric_for_column}
    scores['Text Standard'] = []

    for line in queries:
        for column, metric in metric_for_column:
            scores[column].append(getattr(textstat, metric)(line))
        scores['Text Standard'].append(
            textstat.text_standard(line, float_output=True))

    return scores
示例#11
0
    def evaluate(self, text: str):
        """Score ``text`` on several speech-quality dimensions.

        Emotions come from ``self._emotion_detector`` and sentiment from
        ``self._google_nlp``. Returns a dict with these keys:

        * ``clarity`` -- the constant 5
        * ``text_general_level`` -- ``self._general_level_mapper`` entry for
          the float ``textstat.text_standard`` value
        * ``diversity`` -- ``self._uniqueness_mapper`` entry for ten times
          ``self.text_uniqueness(text)``
        * ``tone`` -- ``self.emotion_converter`` applied to the emotions
        * ``emotional_tones`` -- list of the emotion dict's keys
        * ``speech_sentiment`` -- ten times the sentiment value, or 5 when
          the sentiment result has no ``'sentiment'`` key
        * ``engagement`` -- the emotions' ``'magnitude'`` value, or 5 when
          that key is absent
        * ``who_do_you_look_like`` -- ``self.who_do_you_look_like`` result
        * ``calmness`` -- a random integer from 5 to 9 inclusive
        """
        emotions = self._emotion_detector.extract_emotions_from_raw_text(text)
        sentiment = self._google_nlp.extract_sentiment_from_raw_text(text)

        grade = textstat.text_standard(text, float_output=True)
        general_level = self._general_level_mapper[grade]
        diversity = self._uniqueness_mapper[self.text_uniqueness(text) * 10]
        tone = self.emotion_converter(emotions)
        emotional_tones = list(emotions.keys())

        if 'sentiment' in sentiment:
            speech_sentiment = sentiment['sentiment'] * 10
        else:
            speech_sentiment = 5

        if 'magnitude' in emotions:
            engagement = emotions['magnitude']
        else:
            engagement = 5

        lookalike = self.who_do_you_look_like(emotions)
        calmness = random.randint(5, 9)

        return {
            'clarity': 5,
            'text_general_level': general_level,
            'diversity': diversity,
            'tone': tone,
            'emotional_tones': emotional_tones,
            'speech_sentiment': speech_sentiment,
            'engagement': engagement,
            'who_do_you_look_like': lookalike,
            'calmness': calmness,
        }
    def transform(self, X):
        """
        Transform X into a new dataset, Xprime and return it.

        ``X`` is converted to a DataFrame and must have a ``'Comment'``
        column of strings. Five feature columns are added:

        * ``%OfUpperCaseLetters`` -- percentage of characters that are
          upper case, rounded to 2 decimals (0.0 for an empty comment)
        * ``NoOfURL`` -- number of URL-like substrings found
        * ``NoOfWords`` -- number of ``word_tokenize`` tokens
        * ``AvgSentenceLength`` -- ``textstat.avg_sentence_length``
        * ``TextStandard`` -- ``textstat.text_standard`` as a float

        The frame is also stored on ``self.X``.
        """
        X = pd.DataFrame(X)

        def countCaps(comment):
            # An empty comment has no characters; report 0% rather than
            # raising ZeroDivisionError.
            if not comment:
                return 0.0
            upper_total = sum(1 for c in comment if c.isupper())
            return round(upper_total * 100 / len(comment), 2)

        # Raw string keeps the regex escapes literal. The second alternative
        # previously read "[-zA-Z0-9]", a typo for "[a-zA-Z0-9]" that made
        # hosts like "www.a-b.com" go uncounted.
        url_pattern = re.compile(
            r'https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
            r'|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
            r'|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}'
            r'|www\.[a-zA-Z0-9]+\.[^\s]{2,}'
        )

        X['%OfUpperCaseLetters'] = X['Comment'].apply(countCaps)

        X['NoOfURL'] = X['Comment'].apply(
            lambda x: len(url_pattern.findall(x)))

        X['NoOfWords'] = X['Comment'].apply(lambda x: len(word_tokenize(x)))

        X['AvgSentenceLength'] = X['Comment'].apply(
            lambda x: textstat.avg_sentence_length(x))

        X['TextStandard'] = X['Comment'].apply(
            lambda x: textstat.text_standard(x, float_output=True))

        self.X = X
        return self.X
示例#13
0
def getReadabilityMetrics(test_data):
    '''
        Return the textstat readability metrics of an article given as text.

        The result is a dict keyed by textstat function name, each value
        being that function applied to ``test_data``.
        Uses textstat library, please install it
    '''
    metric_names = (
        "flesch_reading_ease",
        "smog_index",
        "flesch_kincaid_grade",
        "coleman_liau_index",
        "automated_readability_index",
        "dale_chall_readability_score",
        "difficult_words",
        "linsear_write_formula",
        "gunning_fog",
        "text_standard",
    )
    return {name: getattr(textstat, name)(test_data) for name in metric_names}
示例#14
0
def get_stats(text):
    """Return textstat readability metrics plus two simple counts for ``text``.

    The dict has one entry per textstat metric (``text_standard`` is stored
    under ``"consolidated_score"``), plus ``"doc_length"`` (``len(text)``)
    and ``"quote_count"`` (number of double-quote characters).
    """
    stats = {
        "flesch_reading_ease": textstat.flesch_reading_ease(text),
        "smog_index": textstat.smog_index(text),
        "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
        "coleman_liau_index": textstat.coleman_liau_index(text),
        "automated_readability_index":
            textstat.automated_readability_index(text),
        "dale_chall_readability_score":
            textstat.dale_chall_readability_score(text),
        "difficult_words": textstat.difficult_words(text),
        "linsear_write_formula": textstat.linsear_write_formula(text),
        "gunning_fog": textstat.gunning_fog(text),
        "consolidated_score": textstat.text_standard(text),
    }
    # Length counts every character, spaces included.
    stats["doc_length"] = len(text)
    stats["quote_count"] = text.count('"')
    return stats
示例#15
0
def textStatistics(text):
    """
    Return ``(lexicon_count, text_standard)`` for ``text``.

    The lexicon count is taken with punctuation removed and the text
    standard is requested as a float.
    """
    return (
        textstat.lexicon_count(text, removepunct=True),
        textstat.text_standard(text, float_output=True),
    )
示例#16
0
 def reading_standard(text):
     """Extract the grade numbers from textstat's ``text_standard`` label.

     ``text_standard`` yields labels such as "9th and 10th grade" or
     "2nd and 3rd grade". Returns a two-element list of strings
     ``[lower, upper]`` (e.g. ``['9', '10']``). The second element is
     ``None`` when the label carries a single grade. Returns an empty list
     when no grade number is found.

     Every ordinal suffix (st/nd/rd/th) is accepted. The previous pattern
     only recognised "th", so labels like "2nd and 3rd grade" were
     reported as having no grade.
     """
     label = textstat.text_standard(text)
     match = re.search(
         r'(-?\d+)(?:st|nd|rd|th)(?:\s\w{3}\s(-?\d+))?', label)
     if not match:
         return []
     return [match.group(1), match.group(2)]
示例#17
0
 def getReadability(df):
     """Add textstat readability columns computed from ``df.headline_text``.

     Adds 'ARI' (automated readability index), 'DCR' (Dale-Chall
     readability score) and 'TS' (text standard as a float) to ``df`` and
     returns the same frame.
     """
     import textstat

     def standard_as_float(headline):
         return textstat.text_standard(headline, float_output=True)

     column_scorers = (
         ('ARI', textstat.automated_readability_index),
         ('DCR', textstat.dale_chall_readability_score),
         ('TS', standard_as_float),
     )
     for column, scorer in column_scorers:
         df[column] = df.headline_text.apply(scorer)
     return df
示例#18
0
    def score(self, strText):
        """Compute textstat readability metrics for ``strText`` and store them.

        Each numeric metric is stored on ``self`` under the textstat function
        name. A display label is stored under ``str_<name>``, produced by
        ``self.grade`` for the grade-level metrics and by fixed threshold
        tables for the Dale-Chall score and the Flesch reading ease.
        ``difficult_words`` and ``text_standard`` are stored without a label.
        """
        def band_label(value, bands, fallback):
            # Bands are ordered from the highest floor down; first hit wins.
            for floor, label in bands:
                if value >= floor:
                    return ' | ' + label
            return ' | ' + fallback

        dale_chall_bands = (
            (9.0, '13th to 15th grade (college)'),
            (8.0, '11th to 12th grade'),
            (7.0, '9th to 10th grade'),
            (6.0, '7th to 8th grade'),
            (5.0, '5th to 6th grade'),
        )
        reading_ease_bands = (
            (90, 'Very Easy'),
            (80, 'Easy'),
            (70, 'Fairly Easy'),
            (60, 'Standard'),
            (50, 'Fairly Difficult'),
            (30, 'Difficult'),
        )

        ari = textstat.automated_readability_index(strText)
        self.automated_readability_index = ari
        self.str_automated_readability_index = self.grade(ari)

        self.coleman_liau_index = textstat.coleman_liau_index(strText)
        self.str_coleman_liau_index = self.grade(self.coleman_liau_index)

        dale_chall = textstat.dale_chall_readability_score(strText)
        self.dale_chall_readability_score = dale_chall
        self.str_dale_chall_readability_score = band_label(
            dale_chall, dale_chall_bands, '4th grade or lower')

        self.difficult_words = textstat.difficult_words(strText)

        self.flesch_kincaid_grade = textstat.flesch_kincaid_grade(strText)
        self.str_flesch_kincaid_grade = self.grade(self.flesch_kincaid_grade)

        reading_ease = textstat.flesch_reading_ease(strText)
        self.flesch_reading_ease = reading_ease
        self.str_flesch_reading_ease = band_label(
            reading_ease, reading_ease_bands, 'Very Confusing')

        self.gunning_fog = textstat.gunning_fog(strText)
        self.str_gunning_fog = self.grade(self.gunning_fog)

        self.linsear_write_formula = textstat.linsear_write_formula(strText)
        self.str_linsear_write_formula = self.grade(self.linsear_write_formula)

        self.smog_index = textstat.smog_index(strText)
        self.str_smog_index = self.grade(self.smog_index)

        self.text_standard = textstat.text_standard(strText)
示例#19
0
def compute_readability_stats(text):
    """
    Compute reading statistics of the given text
    Reference: https://github.com/shivam5992/textstat

    Parameters
    ==========
    text: str, input section or abstract text

    Returns
    =======
    dict mapping each statistic name to its textstat value. If computing
    any statistic raises an ordinary exception, every value is ``None``.
    The key set is the same in both cases. Only ``Exception`` subclasses are
    caught, so KeyboardInterrupt and SystemExit propagate.
    """
    # (result key, textstat function name); one table drives both the
    # computed dict and the all-None fallback so their keys cannot drift.
    statistics = (
        ('flesch_reading_ease', 'flesch_reading_ease'),
        ('smog', 'smog_index'),
        ('flesch_kincaid_grade', 'flesch_kincaid_grade'),
        ('coleman_liau_index', 'coleman_liau_index'),
        ('automated_readability_index', 'automated_readability_index'),
        ('dale_chall', 'dale_chall_readability_score'),
        ('difficult_words', 'difficult_words'),
        ('linsear_write', 'linsear_write_formula'),
        ('gunning_fog', 'gunning_fog'),
        ('text_standard', 'text_standard'),
        ('n_syllable', 'syllable_count'),
        ('avg_letter_per_word', 'avg_letter_per_word'),
        ('avg_sentence_length', 'avg_sentence_length'),
    )
    try:
        readability_dict = {
            key: getattr(textstat, function_name)(text)
            for key, function_name in statistics
        }
    except Exception:
        readability_dict = {key: None for key, _ in statistics}
    return readability_dict
示例#20
0
文件: utils.py 项目: RTXteam/RTX
 def _call_textstat(desc):
     '''
     Get an estimated school grade level required to understand the text from textstat package (reference: https://github.com/shivam5992/textstat)
     parameter desc[string]: description text
     returns [int]: the first grade number in textstat's text_standard label
     raises TypeError: if desc is not a str (str subclasses are accepted)
     '''
     if not isinstance(desc, str):
         raise TypeError(f"'desc' should be str but {type(desc)} detected")

     res = textstat.text_standard(desc)
     # Keep the first word of the label and strip its lowercase letters
     # (the ordinal suffix) so that only the number remains.
     first_token = res.split(' ')[0]
     return int(re.sub('[a-z]', '', first_token))
示例#21
0
 def readability_scores(self, text):
     """Compute textstat readability metrics for ``text`` and store them on self.

     Each attribute is named after its textstat function, except ``ari``,
     which holds the automated readability index.
     """
     # (attribute to set, textstat function name), in assignment order.
     attribute_metrics = (
         ('ari', 'automated_readability_index'),
         ('flesch_kincaid_grade', 'flesch_kincaid_grade'),
         ('coleman_liau_index', 'coleman_liau_index'),
         ('dale_chall_readability_score', 'dale_chall_readability_score'),
         ('flesch_reading_ease', 'flesch_reading_ease'),
         ('gunning_fog', 'gunning_fog'),
         ('linsear_write_formula', 'linsear_write_formula'),
         ('lix', 'lix'),
         ('rix', 'rix'),
         ('smog_index', 'smog_index'),
         ('text_standard', 'text_standard'),
     )
     for attribute, metric in attribute_metrics:
         setattr(self, attribute, getattr(textstat, metric)(text))
示例#22
0
def get_readability_stats(text):
    """Return a dict of textstat readability metrics for ``text``.

    Keys are the textstat function names; ``text_standard`` is requested as
    a float.
    """
    plain_metrics = (
        'flesch_reading_ease',
        'smog_index',
        'flesch_kincaid_grade',
        'coleman_liau_index',
        'automated_readability_index',
        'dale_chall_readability_score',
        'linsear_write_formula',
        'gunning_fog',
    )
    stats = {name: getattr(textstat, name)(text) for name in plain_metrics}
    stats['text_standard'] = textstat.text_standard(text, float_output=True)
    return stats
示例#23
0
def score_comment(text):
    """Score a comment by its content; the result is capped at 100.

    Code blocks, pull-request links and other links are counted and then
    stripped from the text. The remaining text contributes its named
    entities, sentences, nouns and textstat grade level:

        entities * 10 + sentences * 2.5 + nouns + code * 5
        + PR links * 10 + links * 5 + grade level
    """
    # Count {code} markers, then remove the code so it is not analysed as prose.
    code_total = text.count('{code')
    text = re.sub(r'{code:(.|\r|\n)*{code}', '', text)

    # Count links to GitHub, then remove pull-request URL prefixes.
    pr_link_total = text.count('https://github.com')
    text = re.sub(r'https://github.com.*/pull', '', text)

    # Count remaining https links, then remove every http(s) URL.
    link_total = text.count('https://')
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)

    sentences = nltk.sent_tokenize(text)

    # Part-of-speech tags; any tag containing 'NN' counts as a noun.
    tagged_words = nltk.pos_tag(nltk.word_tokenize(text))
    noun_total = sum(1 for tagged in tagged_words if 'NN' in tagged[1])

    # Named entities are the subtrees labelled 'NE' by the binary chunker.
    chunk_tree = nltk.chunk.ne_chunk(tagged_words, binary=True)
    named_entities = [
        subtree for subtree in chunk_tree.subtrees()
        if subtree.label() == 'NE'
    ]

    # Grade level of the language as a float.
    complexity = textstat.text_standard(text, float_output=True)

    score = (len(named_entities) * 10 + len(sentences) * 2.5 + noun_total
             + code_total * 5 + pr_link_total * 10 + link_total * 5
             + complexity)

    # Extra code and links can push the total past the cap.
    return 100 if score > 100 else score
示例#24
0
def analyze_vocab(text):
    """Return the word count and textstat readability metrics for ``text``.

    ``num_words`` is the textstat lexicon count, ``text_standard`` is
    requested as a float, and every other key is the name of the textstat
    function that produced its value.
    """
    plain_metrics = (
        'flesch_reading_ease',
        'smog_index',
        'flesch_kincaid_grade',
        'coleman_liau_index',
        'automated_readability_index',
        'dale_chall_readability_score',
        'difficult_words',
        'linsear_write_formula',
        'gunning_fog',
    )
    report = {'num_words': textstat.lexicon_count(text)}
    for name in plain_metrics:
        report[name] = getattr(textstat, name)(text)
    report['text_standard'] = textstat.text_standard(text, float_output=True)
    return report
示例#25
0
def vocab_check(text):
    """Return readability scores and difficult-word details for ``text``.

    Returns a 3-tuple ``(vocab_results, diff_words, easy_word_dict)``.
    ``vocab_results`` maps each metric name to its score, with ``yule``
    stored under 'yule_vocab_richness' and the float ``text_standard`` under
    'total_score'. The other two items are the pair returned by
    ``difficult_words``.
    """
    # (result key, scoring function) for the single-argument metrics.
    scorers = (
        ('dale_chall_readability_score', dale_chall_readability_score),
        ('smog_index', smog_index),
        ('gunning_fog', gunning_fog),
        ('flesch_reading_ease', flesch_reading_ease),
        ('flesch_kincaid_grade', flesch_kincaid_grade),
        ('linsear_write_formula', linsear_write_formula),
        ('coleman_liau_index', coleman_liau_index),
        ('automated_readability_index', automated_readability_index),
        ('yule_vocab_richness', yule),
    )
    vocab_results = {key: scorer(text) for key, scorer in scorers}
    vocab_results['total_score'] = text_standard(text, float_output=True)

    diff_words, easy_word_dict = difficult_words(text)

    return vocab_results, diff_words, easy_word_dict
def extract_lexical_features(Authors):
    '''
    Extract the readability and type-token-ratio (TTR) features.

    Takes a dictionary of authors and returns the same dictionary after
    setting ``readability`` and ``TTR`` on each author object.
    '''
    # Readability: mean float text_standard over the author's raw tweets,
    # accumulated as one share per tweet.
    for profile in Authors.values():
        profile.readability = 0
        for tweet in profile.tweets:
            grade = textstat.text_standard(tweet, float_output=True)
            profile.readability += grade / len(profile.tweets)

    # Lexical diversity: TTR of the author's ``clean`` text.
    for profile in Authors.values():
        profile.TTR = ld.ttr(profile.clean)

    return Authors
示例#27
0
 def get_readability_features(self):
     """Compute textstat readability features of ``self.raw_text``.

     The raw text is tokenized by ``text_tokenizer`` and re-joined with one
     sentence per line (tokens separated by spaces). The metrics are
     computed on that string and stored as attributes on ``self``.
     """
     sent_tokens = text_tokenizer(self.raw_text,
                                  replace_url_flag=True,
                                  tokenize_sent_flag=True)
     sentences = ''.join(' '.join(sent) + '\n' for sent in sent_tokens)

     # (attribute to set, textstat function name), in assignment order.
     feature_metrics = (
         ('syllable_count', 'syllable_count'),
         ('flesch_reading_ease', 'flesch_reading_ease'),
         ('flesch_kincaid_grade', 'flesch_kincaid_grade'),
         ('fog_scale', 'gunning_fog'),
         ('smog', 'smog_index'),
         ('automated_readability', 'automated_readability_index'),
         ('coleman_liau', 'coleman_liau_index'),
         ('linsear_write', 'linsear_write_formula'),
         ('dale_chall_readability', 'dale_chall_readability_score'),
         ('text_standard', 'text_standard'),
     )
     for attribute, metric in feature_metrics:
         setattr(self, attribute, getattr(textstat, metric)(sentences))
示例#28
0
def process_file(train_file, test_file, topic_model_file):
    """Refresh the feature columns of the train and test CSV files in place.

    Both CSVs are loaded, a fixed list of outdated columns is dropped, and
    'readability_standard' (float ``textstat.text_standard``) and
    'sentiment' (TextBlob polarity) are computed from the 'text' column.
    One 'Topic #i' column per topic score is then merged in, and each frame
    is written back to the path it was read from, without the index.
    """
    dfs = {
        'train': pd.read_csv(train_file),
        'test': pd.read_csv(test_file),
    }
    # Remove outdated columns
    columns_to_remove = [
        'coleman_liau_index',
        'automated_readability_index',
        'dale_chall_readability_score',
        'linsear_write_formula',
        'gunning_fog',
        'flesch_reading_ease',
        'Unnamed: 0',
        'Unnamed: 0.1',
        'Unnamed: 0.1.1',
    ]
    for key in list(dfs):
        source_frame = dfs[key]
        trimmed = source_frame
        for col in columns_to_remove:
            trimmed = trimmed.drop(col, axis=1)
        dfs[key] = trimmed
        print('Removed old columns')

        # New columns are derived from the text of the frame as loaded.
        texts = source_frame['text']
        trimmed['readability_standard'] = texts.apply(
            lambda r: textstat.text_standard(r, float_output=True))
        print('Added readability')
        trimmed['sentiment'] = texts.apply(
            lambda r: TextBlob(r).sentiment.polarity)
        print('Added sentiment')

    # Add topic scores
    corpus, topic_model = load_topic_model(topic_model_file)
    topics_vectorizer = TopicModelVectorizer(topic_model, corpus)
    topic_scores = {
        'train': topics_vectorizer.fit_transform(dfs['train']),
        'test': topics_vectorizer.transform(dfs['test']),
    }
    print('Fetched topic scores')

    for key, scores in topic_scores.items():
        topic_columns = [f'Topic #{i}' for i in range(scores.shape[1])]
        scores_df = pd.DataFrame(data=scores,
                                 columns=topic_columns,
                                 index=dfs[key].index)
        dfs[key] = dfs[key].merge(scores_df, left_index=True, right_index=True)
        print('Added topic scores')

    dfs['train'].to_csv(train_file, index=False)
    dfs['test'].to_csv(test_file, index=False)
 def score_text(self, test_data):
     """Return a dict of textstat readability metrics for ``test_data``.

     Each key is the name of the textstat function that produced its value.
     """
     return {
         'flesch_reading_ease': textstat.flesch_reading_ease(test_data),
         'smog_index': textstat.smog_index(test_data),
         'flesch_kincaid_grade': textstat.flesch_kincaid_grade(test_data),
         'coleman_liau_index': textstat.coleman_liau_index(test_data),
         'automated_readability_index':
             textstat.automated_readability_index(test_data),
         'dale_chall_readability_score':
             textstat.dale_chall_readability_score(test_data),
         'difficult_words': textstat.difficult_words(test_data),
         'linsear_write_formula': textstat.linsear_write_formula(test_data),
         'gunning_fog': textstat.gunning_fog(test_data),
         'text_standard': textstat.text_standard(test_data),
     }
示例#30
0
    def _extract_readability_scores(self, text: Text, scores=None) -> Dict:
        """Compute the requested textstat readability scores for ``text``.

        Args:
            text: The text to score.
            scores: Container of score names to compute, or ``None`` to
                compute all of them. Names that are not recognised are
                ignored.

        Returns:
            Dict mapping each computed score name to its value, keyed by
            the textstat function name. ``text_standard`` is requested as a
            float. An empty ``scores`` container yields an empty dict.
        """
        def wanted(name):
            # Identity test: ``scores == None`` would invoke a custom or
            # element-wise ``__eq__`` on containers such as numpy arrays.
            return scores is None or name in scores

        plain_metrics = (
            'flesch_reading_ease',
            'smog_index',
            'flesch_kincaid_grade',
            'coleman_liau_index',
            'automated_readability_index',
            'dale_chall_readability_score',
            'difficult_words',
            'linsear_write_formula',
            'gunning_fog',
        )

        output = {}
        for name in plain_metrics:
            if wanted(name):
                output[name] = getattr(textstat, name)(text)

        if wanted('text_standard'):
            output['text_standard'] = textstat.text_standard(
                text, float_output=True)

        return output
示例#31
0
def text_analysis(test_data):
    """Print each textstat readability metric of ``test_data`` as ``name: value``."""
    metric_names = (
        # Higher scores indicate material that is easier to read; aim for >60.0.
        'flesch_reading_ease',
        # Calculates US grade level.
        'smog_index',
        # Calculates US grade level.
        'flesch_kincaid_grade',
        # Coleman-Liau: calculates US grade level.
        'coleman_liau_index',
        # Calculates US grade level.
        'automated_readability_index',
        # 0.1579 * (difficult words / words * 100) + 0.0496 * (words / sentences)
        'dale_chall_readability_score',
        # Number of difficult words.
        'difficult_words',
        # US grade level based on sentence length and the number of words
        # with three or more syllables.
        'linsear_write_formula',
        # The text can be understood by someone who left full-time education
        # at a later age than the index.
        'gunning_fog',
        # Calculates US grade level.
        'text_standard',
    )
    for name in metric_names:
        value = getattr(textstat, name)(test_data)
        print(name + ': ' + str(value))
示例#32
0
def test_unicode_support():
    """Smoke test: ``text_standard`` must accept non-ASCII input without raising.

    The same Japanese phrase is passed once as escape sequences and once as
    a literal.
    """
    samples = (
        "\u3042\u308a\u304c\u3068\u3046\u3054\u3056\u3044\u307e\u3059",
        u"ありがとうございます",
    )
    for sample in samples:
        textstat.text_standard(sample)