Example #1
File: DE_main.py Project: zzs-NLP/ACS-QG
def get_readibility(text, metric="flesch_kincaid_grade"):
    """
    Return a score which reveals a piece of text's readability level.
    Reference: https://chartbeat-labs.github.io/textacy/getting_started/quickstart.html
               https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
    """
    if metric == "flesch_kincaid_grade":
        result = textstat.flesch_kincaid_grade(text)
    elif metric == "flesch_reading_ease":
        result = textstat.flesch_reading_ease(text)
    elif metric == "smog_index":
        result = textstat.smog_index(text)
    elif metric == "coleman_liau_index":
        result = textstat.coleman_liau_index(text)
    elif metric == "automated_readability_index":
        result = textstat.automated_readability_index(text)
    elif metric == "dale_chall_readability_score":
        result = textstat.dale_chall_readability_score(text)
    elif metric == "difficult_words":
        result = textstat.difficult_words(text)
    elif metric == "linsear_write_formula":
        result = textstat.linsear_write_formula(text)
    elif metric == "gunning_fog":
        result = textstat.gunning_fog(text)
    elif metric == "text_standard":
        result = textstat.text_standard(text)
    else:
        print("ERROR: Please select correct metric!")
        result = None
    return result
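A minimal usage sketch for the function above (the sample sentence is illustrative); the dict-dispatch variant shows a common alternative to the if/elif chain and is not part of the source project:

# Usage sketch; assumes textstat is installed (pip install textstat).
import textstat

sample = "The quick brown fox jumps over the lazy dog."
print(get_readibility(sample))                        # Flesch-Kincaid grade (default)
print(get_readibility(sample, metric="gunning_fog"))  # Gunning fog index

# Alternative: a name -> function table instead of the if/elif chain.
METRICS = {name: getattr(textstat, name) for name in (
    "flesch_kincaid_grade", "flesch_reading_ease", "smog_index",
    "coleman_liau_index", "automated_readability_index",
    "dale_chall_readability_score", "difficult_words",
    "linsear_write_formula", "gunning_fog", "text_standard")}

def get_readibility_v2(text, metric="flesch_kincaid_grade"):
    func = METRICS.get(metric)
    return func(text) if func is not None else None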
Example #2
def get_stats(text):
    fre = textstat.flesch_reading_ease(text)
    smog = textstat.smog_index(text)
    fkg = textstat.flesch_kincaid_grade(text)
    cli = textstat.coleman_liau_index(text)
    ari = textstat.automated_readability_index(text)
    dcr = textstat.dale_chall_readability_score(text)
    diff_words = textstat.difficult_words(text)
    lwf = textstat.linsear_write_formula(text)
    gunn_fog = textstat.gunning_fog(text)
    consolidated_score = textstat.text_standard(text)

    doc_length = len(text)  # think about excluding spaces?
    quote_count = text.count('"')

    stats = {
        "flesch_reading_ease": fre,
        "smog_index": smog,
        "flesch_kincaid_grade": fkg,
        "coleman_liau_index": cli,
        "automated_readability_index": ari,
        "dale_chall_readability_score": dcr,
        "difficult_words": diff_words,
        "linsear_write_formula": lwf,
        "gunning_fog": gunn_fog,
        "consolidated_score": consolidated_score,
        "doc_length": doc_length,
        "quote_count": quote_count
    }
    return stats
Example #3
def seven_test(processed_essay):
    """
    score which is assigned to every script in on the basis of some predifened fomulas
    These scores are known as readability score.
    flesch_score,gunning_index,kincaid_grade,liau_index,automated_readability_index,dale_readability_score,difficult_word,linsear_write
    :param processed_essay:
    :return:flesch_score,gunning_index,kincaid_grade,liau_index,automated_readability_index,dale_readability_score,difficult_word,linsear_write
    """
    flesch_score = ["FS"]
    gunning_index = ["GI"]
    kincaid_grade = ["KG"]
    liau_index = ["LI"]
    automated_readability_index = ["ARI"]
    dale_readability_score = ["DLS"]
    difficult_word = ["DW"]
    linsear_write = ["LW"]
    for v in processed_essay:
        flesch_score.append(textstat.flesch_reading_ease(str(v)))
        gunning_index.append(textstat.gunning_fog(str(v)))
        kincaid_grade.append(textstat.flesch_kincaid_grade(str(v)))
        liau_index.append(textstat.coleman_liau_index(str(v)))
        automated_readability_index.append(textstat.automated_readability_index(str(v)))
        dale_readability_score.append(textstat.dale_chall_readability_score(str(v)))
        difficult_word.append(textstat.difficult_words(str(v)))
        linsear_write.append(textstat.linsear_write_formula(str(v)))
    return flesch_score,gunning_index,kincaid_grade,liau_index,automated_readability_index,dale_readability_score,difficult_word,linsear_write
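Each list above carries its label ("FS", "GI", ...) as element 0, so downstream code has to skip the header entry. A sketch of the same loop keyed by label (illustrative only, not from the source project):

# Same eight metrics, returned as a dict of plain score lists.
def seven_test_dict(processed_essay):
    metrics = {
        "FS": textstat.flesch_reading_ease,
        "GI": textstat.gunning_fog,
        "KG": textstat.flesch_kincaid_grade,
        "LI": textstat.coleman_liau_index,
        "ARI": textstat.automated_readability_index,
        "DLS": textstat.dale_chall_readability_score,
        "DW": textstat.difficult_words,
        "LW": textstat.linsear_write_formula,
    }
    return {label: [fn(str(v)) for v in processed_essay]
            for label, fn in metrics.items()}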
Example #4
def get_readability_score(text, metric="flesch"):
    global tknzr, DIFFICULT

    text = text.replace("’", "'")

    # https://pypi.org/project/textstat/
    if metric == "flesch":
        return textstat.flesch_reading_ease(text)
    elif metric == "smog":
        return textstat.smog_index(text)
    elif metric == "coleman_liau_index":
        return textstat.coleman_liau_index(text)
    elif metric == "automated_readability_index":
        return textstat.automated_readability_index(text)
    elif metric == "dale_chall_readability_score":
        return textstat.dale_chall_readability_score(text)
    elif metric == "difficult_words":
        nb_difficult = 0
        nb_easy = 0
        for w in set(tknzr.tokenize(text.lower())):
            if w not in EASY_WORDS and len(w) >= 6:
                nb_difficult += 1
            else:
                nb_easy += 1
        return 100 * nb_difficult / (nb_difficult + nb_easy)
        #return textstat.difficult_words(text)#/len(text.split())
    elif metric == "linsear_write_formula":
        return textstat.linsear_write_formula(text)
    elif metric == "gunning_fog":
        return textstat.gunning_fog(text)
    elif metric == "avg_word_length":
        words = tknzr.tokenize(text)
        words = [w for w in words if w not in misc_utils.PUNCT]
        if len(words) == 0: return 0
        return np.average([len(w) for w in words])
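This excerpt leans on module-level names that are not shown (tknzr, EASY_WORDS, misc_utils.PUNCT, np); note also that it declares global DIFFICULT but actually reads EASY_WORDS. A plausible setup, offered purely as an assumption about the original module:

# Assumed definitions for the globals used above -- not shown in the source.
import string
import numpy as np
from nltk.tokenize import TweetTokenizer  # assumption: any word tokenizer would do

tknzr = TweetTokenizer()
EASY_WORDS = set()   # assumption: e.g. loaded from the Dale-Chall easy-word list
DIFFICULT = 6        # assumption: the length threshold hard-coded as len(w) >= 6

class misc_utils:    # stand-in for the project's own helper module
    PUNCT = set(string.punctuation)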
Example #5
def analyze():
    print(request)
    str_to_read = request.data.decode("utf-8").strip()

    report = {
        "flesch-reading-ease":
        textstat.flesch_reading_ease(str_to_read),
        "smog-index":
        textstat.smog_index(str_to_read),
        "flesch-kincaid-grade":
        textstat.flesch_kincaid_grade(str_to_read),
        "coleman-liau-index":
        textstat.coleman_liau_index(str_to_read),
        "automated-readability-index":
        textstat.automated_readability_index(str_to_read),
        "dale-chall-readability-score":
        textstat.dale_chall_readability_score(str_to_read),
        "difficult-words":
        textstat.difficult_words(str_to_read),
        "linsear-write-formula":
        textstat.linsear_write_formula(str_to_read),
        "gunning-fog":
        textstat.gunning_fog(str_to_read),
        "text-standard":
        textstat.text_standard(str_to_read)
    }
    return decorate_response(jsonify(report))
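A client-side sketch for this view; the route path and port are assumptions, since only the handler body is shown:

# Hypothetical client call; URL and route are assumptions.
import requests

resp = requests.post("http://localhost:5000/analyze",
                     data="Some text to score.".encode("utf-8"))
print(resp.json()["flesch-reading-ease"])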
Example #6
def readability(queries):
    scores = {
        'Flesch': [],
        'Smog': [],
        'Flesch grade': [],
        'Coleman': [],
        'Automated': [],
        'Dale': [],
        'Difficult': [],
        'Linsear': [],
        'Gunning': [],
        'Text Standard': []
    }
    for line in queries:
        scores['Flesch'].append(textstat.flesch_reading_ease(line))
        scores['Smog'].append(textstat.smog_index(line))
        scores['Flesch grade'].append(textstat.flesch_kincaid_grade(line))
        scores['Coleman'].append(textstat.coleman_liau_index(line))
        scores['Automated'].append(textstat.automated_readability_index(line))
        scores['Dale'].append(textstat.dale_chall_readability_score(line))
        scores['Difficult'].append(textstat.difficult_words(line))
        scores['Linsear'].append(textstat.linsear_write_formula(line))
        scores['Gunning'].append(textstat.gunning_fog(line))
        scores['Text Standard'].append(
            textstat.text_standard(line, float_output=True))

    return scores
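Because the function returns a dict of equal-length lists, the pandas constructor (pd is already imported in this example) turns the result straight into a frame; the query strings are illustrative:

# Illustrative usage: one row of scores per query line.
queries = ["The cat sat on the mat.",
           "Notwithstanding the inclement weather, the expedition persevered."]
df_scores = pd.DataFrame(readability(queries))
print(df_scores.round(2))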
Example #7
def getReadabilityMetrics(test_data):
    '''
        For a given article (plain text), return its readability metrics.
        Uses the textstat library; install it first (pip install textstat).
    '''
    metric = {
        "flesch_reading_ease":
        textstat.flesch_reading_ease(test_data),
        "smog_index":
        textstat.smog_index(test_data),
        "flesch_kincaid_grade":
        textstat.flesch_kincaid_grade(test_data),
        "coleman_liau_index":
        textstat.coleman_liau_index(test_data),
        "automated_readability_index":
        textstat.automated_readability_index(test_data),
        "dale_chall_readability_score":
        textstat.dale_chall_readability_score(test_data),
        "difficult_words":
        textstat.difficult_words(test_data),
        "linsear_write_formula":
        textstat.linsear_write_formula(test_data),
        "gunning_fog":
        textstat.gunning_fog(test_data),
        "text_standard":
        textstat.text_standard(test_data)
    }
    return metric
Example #8
def textstat_stats(text):
    doc_length = len(text.split()) 
    flesch_ease = ts.flesch_reading_ease(text) #Flesch Reading Ease Score
    flesch_grade = ts.flesch_kincaid_grade(text) #Flesch-Kincaid Grade Level
    gfog = ts.gunning_fog(text) # FOG index, also indicates grade level
#    smog = ts.smog_index(text) # SMOG index, also indicates grade level, only useful on 30+ sentences
    auto_readability = ts.automated_readability_index(text) #approximates the grade level needed to comprehend the text.
    cl_index = ts.coleman_liau_index(text) #grade level of the text using the Coleman-Liau Formula.
    lw_formula = ts.linsear_write_formula(text) #grade level using the Linsear Write Formula.
    dcr_score = ts.dale_chall_readability_score(text) #uses a lookup table of the most commonly used 3000 English words
#    text_standard = ts.text_standard(text, float_output=False) # summary of all the grade level functions
    syll_count = ts.syllable_count(text, lang='en_US')
    syll_count_scaled = syll_count / doc_length
    lex_count = ts.lexicon_count(text, removepunct=True)
    lex_count_scaled = lex_count / doc_length
    idx = ['flesch_ease', 'flesch_grade','gfog',
           'auto_readability','cl_index','lw_formula',
           'dcr_score', 
#           'text_standard', 
           'syll_count', 'lex_count']
    return pd.Series([flesch_ease, flesch_grade, gfog, 
                      auto_readability, cl_index, lw_formula, 
                      dcr_score, 
#                      text_standard, 
                      syll_count_scaled, lex_count_scaled], index = idx)
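Since the function returns a pd.Series, applying it across a column of documents expands each statistic into its own column; a sketch (the docs frame is illustrative):

# Sketch: expand a text column into readability feature columns.
import pandas as pd

docs = pd.DataFrame({"text": ["First short document.",
                              "A considerably more elaborate second document."]})
features = docs["text"].apply(textstat_stats)
docs = pd.concat([docs, features], axis=1)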
Example #9
def compute_readability_stats(text):
    """
    Compute reading statistics of the given text
    Reference: https://github.com/shivam5992/textstat

    Parameters
    ==========
    text: str, input section or abstract text
    """
    try:
        readability_dict = {
            'flesch_reading_ease':
            textstat.flesch_reading_ease(text),
            'smog':
            textstat.smog_index(text),
            'flesch_kincaid_grade':
            textstat.flesch_kincaid_grade(text),
            'coleman_liau_index':
            textstat.coleman_liau_index(text),
            'automated_readability_index':
            textstat.automated_readability_index(text),
            'dale_chall':
            textstat.dale_chall_readability_score(text),
            'difficult_words':
            textstat.difficult_words(text),
            'linsear_write':
            textstat.linsear_write_formula(text),
            'gunning_fog':
            textstat.gunning_fog(text),
            'text_standard':
            textstat.text_standard(text),
            'n_syllable':
            textstat.syllable_count(text),
            'avg_letter_per_word':
            textstat.avg_letter_per_word(text),
            'avg_sentence_length':
            textstat.avg_sentence_length(text)
        }
    except Exception:
        readability_dict = {
            'flesch_reading_ease': None,
            'smog': None,
            'flesch_kincaid_grade': None,
            'coleman_liau_index': None,
            'automated_readability_index': None,
            'dale_chall': None,
            'difficult_words': None,
            'linsear_write': None,
            'gunning_fog': None,
            'text_standard': None,
            'n_syllable': None,
            'avg_letter_per_word': None,
            'avg_sentence_length': None
        }
    return readability_dict
Example #10
    def score(self, strText):
        self.automated_readability_index = textstat.automated_readability_index(
            strText)
        self.str_automated_readability_index = self.grade(
            self.automated_readability_index)

        self.coleman_liau_index = textstat.coleman_liau_index(strText)
        self.str_coleman_liau_index = self.grade(self.coleman_liau_index)

        self.dale_chall_readability_score = textstat.dale_chall_readability_score(
            strText)
        if self.dale_chall_readability_score >= 9.0:
            self.str_dale_chall_readability_score = ' | ' + '13th to 15th grade (college)'
        elif self.dale_chall_readability_score >= 8.0:
            self.str_dale_chall_readability_score = ' | ' + '11th to 12th grade'
        elif self.dale_chall_readability_score >= 7.0:
            self.str_dale_chall_readability_score = ' | ' + '9th to 10th grade'
        elif self.dale_chall_readability_score >= 6.0:
            self.str_dale_chall_readability_score = ' | ' + '7th to 8th grade'
        elif self.dale_chall_readability_score >= 5.0:
            self.str_dale_chall_readability_score = ' | ' + '5th to 6th grade'
        else:
            self.str_dale_chall_readability_score = ' | ' + '4th grade or lower'

        self.difficult_words = textstat.difficult_words(strText)

        self.flesch_kincaid_grade = textstat.flesch_kincaid_grade(strText)
        self.str_flesch_kincaid_grade = self.grade(self.flesch_kincaid_grade)

        self.flesch_reading_ease = textstat.flesch_reading_ease(strText)
        if self.flesch_reading_ease >= 90:
            self.str_flesch_reading_ease = ' | ' + 'Very Easy'
        elif self.flesch_reading_ease >= 80:
            self.str_flesch_reading_ease = ' | ' + 'Easy'
        elif self.flesch_reading_ease >= 70:
            self.str_flesch_reading_ease = ' | ' + 'Fairly Easy'
        elif self.flesch_reading_ease >= 60:
            self.str_flesch_reading_ease = ' | ' + 'Standard'
        elif self.flesch_reading_ease >= 50:
            self.str_flesch_reading_ease = ' | ' + 'Fairly Difficult'
        elif self.flesch_reading_ease >= 30:
            self.str_flesch_reading_ease = ' | ' + 'Difficult'
        else:
            self.str_flesch_reading_ease = ' | ' + 'Very Confusing'

        self.gunning_fog = textstat.gunning_fog(strText)
        self.str_gunning_fog = self.grade(self.gunning_fog)

        self.linsear_write_formula = textstat.linsear_write_formula(strText)
        self.str_linsear_write_formula = self.grade(self.linsear_write_formula)

        self.smog_index = textstat.smog_index(strText)
        self.str_smog_index = self.grade(self.smog_index)

        self.text_standard = textstat.text_standard(strText)
Example #11
    def process(self, df):

        t0 = time()
        print("\n---Generating Readability Features:---\n")

        def lexical_diversity(text):
            words = nltk.tokenize.word_tokenize(text.lower())
            word_count = len(words)
            vocab_size = len(set(words))
            diversity_score = vocab_size / word_count
            return diversity_score

        def get_counts(text, word_list):
            words = nltk.tokenize.word_tokenize(text.lower())
            count = 0
            for word in words:
                if word in word_list:
                    count += 1
            return count

        df['flesch_reading_ease'] = df['articleBody'].map(lambda x: textstat.flesch_reading_ease(x))
        df['smog_index'] = df['articleBody'].map(lambda x: textstat.smog_index(x))
        df['flesch_kincaid_grade'] = df['articleBody'].map(lambda x: textstat.flesch_kincaid_grade(x))
        df['coleman_liau_index'] = df['articleBody'].map(lambda x: textstat.coleman_liau_index(x))
        df['automated_readability_index'] = df['articleBody'].map(lambda x: textstat.automated_readability_index(x))
        df['dale_chall_readability_score'] = df['articleBody'].map(lambda x: textstat.dale_chall_readability_score(x))
        df['difficult_words'] = df['articleBody'].map(lambda x: textstat.difficult_words(x))
        df['linsear_write_formula'] = df['articleBody'].map(lambda x: textstat.linsear_write_formula(x))
        df['gunning_fog'] = df['articleBody'].map(lambda x: textstat.gunning_fog(x))
        df['i_me_myself'] = df['articleBody'].apply(get_counts,args = (['i', 'me', 'myself'],))
        df['punct'] = df['articleBody'].apply(get_counts,args = ([',','.', '!', '?'],))
        df['lexical_diversity'] = df['articleBody'].apply(lexical_diversity)

        feats = ['flesch_reading_ease', 'smog_index', 'flesch_kincaid_grade',
        'coleman_liau_index', 'automated_readability_index', 
        'dale_chall_readability_score', 'difficult_words', 'linsear_write_formula',
        'gunning_fog', 'i_me_myself', 'punct', 'lexical_diversity'
        ]


        outfilename_xReadable = df[feats].values

        with open('../saved_data/read.pkl', 'wb') as outfile:
            pickle.dump(feats, outfile, -1)
            pickle.dump(outfilename_xReadable, outfile, -1)

        print ('readable features saved in read.pkl')
        
        print('\n---Readability Features is complete---')
        print("Time taken {} seconds\n".format(time() - t0))
        
        return 1
Example #12
    def readability_scores(self, text):
        self.ari = textstat.automated_readability_index(text)
        self.flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
        self.coleman_liau_index = textstat.coleman_liau_index(text)
        self.dale_chall_readability_score = textstat.dale_chall_readability_score(text)
        self.flesch_reading_ease = textstat.flesch_reading_ease(text)
        self.gunning_fog = textstat.gunning_fog(text)
        self.linsear_write_formula = textstat.linsear_write_formula(text)
        self.lix = textstat.lix(text)
        self.rix = textstat.rix(text)
        self.smog_index = textstat.smog_index(text)
        self.text_standard = textstat.text_standard(text)
Example #13
def get_readability_stats(text):
    return {
        'flesch_reading_ease': textstat.flesch_reading_ease(text),
        'smog_index': textstat.smog_index(text),
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
        'coleman_liau_index': textstat.coleman_liau_index(text),
        'automated_readability_index':
        textstat.automated_readability_index(text),
        'dale_chall_readability_score':
        textstat.dale_chall_readability_score(text),
        'linsear_write_formula': textstat.linsear_write_formula(text),
        'gunning_fog': textstat.gunning_fog(text),
        'text_standard': textstat.text_standard(text, float_output=True),
    }
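Note that text_standard is called with float_output=True, so the consensus comes back as a number instead of the default grade-band string; a quick illustration (sample text and outputs are indicative only):

sample = "This is a reasonably plain sentence for testing."
print(textstat.text_standard(sample))                     # e.g. "2nd and 3rd grade"
print(textstat.text_standard(sample, float_output=True))  # e.g. 2.0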
Example #14
def vocab_check(text):
    
    #Construct dictionary
    vocab_results = {'dale_chall_readability_score': dale_chall_readability_score(text),
                     'smog_index': smog_index(text), 'gunning_fog': gunning_fog(text),
                     'flesch_reading_ease': flesch_reading_ease(text),
                     'flesch_kincaid_grade': flesch_kincaid_grade(text),
                     'linsear_write_formula': linsear_write_formula(text),
                     'coleman_liau_index': coleman_liau_index(text),
                     'automated_readability_index': automated_readability_index(text),
                     'yule_vocab_richness': yule(text),
                     'total_score': text_standard(text, float_output=True)}
                     
    diff_words, easy_word_dict = difficult_words(text)
    
    return vocab_results, diff_words, easy_word_dict
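The unqualified names suggest a star-style import plus project-local helpers; a plausible import header, offered as an assumption (yule() and this two-value difficult_words() are custom helpers, not textstat's):

# Assumed imports for the example above.
from textstat import (dale_chall_readability_score, smog_index, gunning_fog,
                      flesch_reading_ease, flesch_kincaid_grade,
                      linsear_write_formula, coleman_liau_index,
                      automated_readability_index, text_standard)
# yule() and the (difficult_words, easy_word_dict) variant of
# difficult_words() would be defined elsewhere in the project.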
Example #15
def analyze_vocab(text):
    return {
        'num_words': textstat.lexicon_count(text),
        'flesch_reading_ease': textstat.flesch_reading_ease(text),
        'smog_index': textstat.smog_index(text),
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
        'coleman_liau_index': textstat.coleman_liau_index(text),
        'automated_readability_index':
        textstat.automated_readability_index(text),
        'dale_chall_readability_score':
        textstat.dale_chall_readability_score(text),
        'difficult_words': textstat.difficult_words(text),
        'linsear_write_formula': textstat.linsear_write_formula(text),
        'gunning_fog': textstat.gunning_fog(text),
        'text_standard': textstat.text_standard(text, float_output=True)
    }
Example #16
def lisibilty(text):

    f_lis = ([
        textstat.syllable_count(str(text), lang='en_arabic'),
        textstat.lexicon_count(str(text), removepunct=True),
        textstat.sentence_count(str(text)),
        textstat.flesch_reading_ease(str(text)),
        textstat.flesch_kincaid_grade(str(text)),
        textstat.gunning_fog(str(text)),
        textstat.smog_index(str(text)),
        textstat.automated_readability_index(str(text)),
        textstat.coleman_liau_index(str(text)),
        textstat.linsear_write_formula(str(text)),
        textstat.dale_chall_readability_score(str(text))
    ])
    return f_lis
Example #17
def textstat_stats(text):
    difficulty = textstat.flesch_reading_ease(text)
    grade_difficulty = textstat.flesch_kincaid_grade(text)
    gfog = textstat.gunning_fog(text)
    smog = textstat.smog_index(text)
    ari = textstat.automated_readability_index(text)
    cli = textstat.coleman_liau_index(text)
    lwf = textstat.linsear_write_formula(text)
    dcrs = textstat.dale_chall_readability_score(text)
    idx = [
        'difficulty', 'grade_difficulty', 'gfog', 'smog', 'ari', 'cli', 'lwf',
        'dcrs'
    ]

    return pd.Series(
        [difficulty, grade_difficulty, gfog, smog, ari, cli, lwf, dcrs],
        index=idx)
Example #18
    def get_readability_features(self):
        sent_tokens = text_tokenizer(self.raw_text,
                                     replace_url_flag=True,
                                     tokenize_sent_flag=True)
        sentences = ''.join(' '.join(sent) + '\n' for sent in sent_tokens)
        self.syllable_count = textstat.syllable_count(sentences)
        self.flesch_reading_ease = textstat.flesch_reading_ease(sentences)
        self.flesch_kincaid_grade = textstat.flesch_kincaid_grade(sentences)
        self.fog_scale = textstat.gunning_fog(sentences)
        self.smog = textstat.smog_index(sentences)
        self.automated_readability = textstat.automated_readability_index(sentences)
        self.coleman_liau = textstat.coleman_liau_index(sentences)
        self.linsear_write = textstat.linsear_write_formula(sentences)
        self.dale_chall_readability = textstat.dale_chall_readability_score(sentences)
        self.text_standard = textstat.text_standard(sentences)
Example #19
    def score_text(self, test_data):
        score = {}
        score['flesch_reading_ease'] = textstat.flesch_reading_ease(test_data)
        score['smog_index'] = textstat.smog_index(test_data)
        score['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(test_data)
        score['coleman_liau_index'] = textstat.coleman_liau_index(test_data)
        score['automated_readability_index'] = textstat.automated_readability_index(test_data)
        score['dale_chall_readability_score'] = textstat.dale_chall_readability_score(test_data)
        score['difficult_words'] = textstat.difficult_words(test_data)
        score['linsear_write_formula'] = textstat.linsear_write_formula(test_data)
        score['gunning_fog'] = textstat.gunning_fog(test_data)
        score['text_standard'] = textstat.text_standard(test_data)
        return score
Example #20
    def _extract_readability_scores(self, text: Text, scores=None) -> Dict:
        output = {}
        if scores is None or 'flesch_reading_ease' in scores:
            output['flesch_reading_ease'] = textstat.flesch_reading_ease(text)

        if scores is None or 'smog_index' in scores:
            output['smog_index'] = textstat.smog_index(text)

        if scores is None or 'flesch_kincaid_grade' in scores:
            output['flesch_kincaid_grade'] = textstat.flesch_kincaid_grade(text)

        if scores is None or 'coleman_liau_index' in scores:
            output['coleman_liau_index'] = textstat.coleman_liau_index(text)

        if scores is None or 'automated_readability_index' in scores:
            output['automated_readability_index'] = \
                textstat.automated_readability_index(text)

        if scores is None or 'dale_chall_readability_score' in scores:
            output['dale_chall_readability_score'] = \
                textstat.dale_chall_readability_score(text)

        if scores is None or 'difficult_words' in scores:
            output['difficult_words'] = textstat.difficult_words(text)

        if scores is None or 'linsear_write_formula' in scores:
            output['linsear_write_formula'] = textstat.linsear_write_formula(text)

        if scores is None or 'gunning_fog' in scores:
            output['gunning_fog'] = textstat.gunning_fog(text)

        if scores is None or 'text_standard' in scores:
            output['text_standard'] = textstat.text_standard(text, float_output=True)

        return output
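A usage sketch for the scores filter (the instance name and sample text are illustrative): passing None computes every metric, while any collection of names restricts the output:

# Illustrative call; `extractor` stands in for the enclosing class instance.
all_scores = extractor._extract_readability_scores("Some sample text here.")
two_scores = extractor._extract_readability_scores(
    "Some sample text here.", scores={"smog_index", "gunning_fog"})
# two_scores == {'smog_index': ..., 'gunning_fog': ...}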
Example #21
def text_analysis(test_data):
	#flesch_reading_ease: higher scores indicate material that is easier to read. aim for >60.0
	print ('flesch_reading_ease: '+str(textstat.flesch_reading_ease(test_data)))
	#smog_index: Calculates US grade level
	print ('smog_index: '+str(textstat.smog_index(test_data)))
	#flesch_kincaid_grade: Calculates US grade level
	print ('flesch_kincaid_grade: '+str(textstat.flesch_kincaid_grade(test_data)))
	#Coleman-Liau: Calculates US grade level
	print ('coleman_liau_index: '+str(textstat.coleman_liau_index(test_data)))
	#automated_readability_index: Calculates US grade level
	print ('automated_readability_index: '+str(textstat.automated_readability_index(test_data)))
	#Dale-Chall Readability Score: 0.1579 * (difficult words / words * 100) + 0.0496 * (words / sentences)
	print ('dale_chall_readability_score: '+str(textstat.dale_chall_readability_score(test_data)))
	#number of difficult words
	print ('difficult_words: '+str(textstat.difficult_words(test_data)))
	#Linsear Write: Calculates the U.S. grade level of a text sample based on sentence length and the number of words with three or more syllables. 
	print ('linsear_write_formula: '+str(textstat.linsear_write_formula(test_data)))
	#gunning_fog: The text can be understood by someone who left full-time education at a later age than the index
	print ('gunning_fog: '+str(textstat.gunning_fog(test_data)))
	#text_standard: Calculates US grade level
	print ('text_standard: '+str(textstat.text_standard(test_data)))
Example #22
def print_readability(text_to_analyse, option='short'):
    if option == 'all':
        print(
            "flesch (0-29: confusing, 30-59: Difficult, 60-69: Standard, 70-100: Easy): ",
            textstat.flesch_reading_ease(text_to_analyse))
        print("smog (years of education required): ",
              textstat.smog_index(text_to_analyse))
        print(
            "flesch kincaid (U.S. school grade level): ",
            textstat.flesch_kincaid_grade(text_to_analyse))
        print("coleman liau: ", textstat.coleman_liau_index(text_to_analyse))
        print(
            "auto read (1-4: 5-10 years age; 5-8: 10-14 y; 9-12: 14-18 y; 13-14: 18+): ",
            textstat.automated_readability_index(text_to_analyse))
        print("dale chall (< 5: kid; 5-8: scholar; 9-10: college): ",
              textstat.dale_chall_readability_score(text_to_analyse))
        print("difficult words: ", textstat.difficult_words(text_to_analyse))
        print("linsear write: ",
              textstat.linsear_write_formula(text_to_analyse))
        print("gunning fog (9-12: High-school; 13-17: College): ",
              textstat.gunning_fog(text_to_analyse))

    print("text standard (estimated school grade level): ",
          textstat.text_standard(text_to_analyse))
Example #23
        smog_index = textstat.smog_index(raw)
        worksheet.update("H" + row, smog_index)

        # Automated Readability Index
        # https://en.wikipedia.org/wiki/Automated_readability_index
        automated_readability_index = textstat.automated_readability_index(raw)
        worksheet.update("I" + row, automated_readability_index)

        # The Coleman-Liau Index
        # https://en.wikipedia.org/wiki/Coleman%E2%80%93Liau_index
        coleman_liau_index = textstat.coleman_liau_index(raw)
        worksheet.update("J" + row, coleman_liau_index)

        # Linsear Write Formula
        # https://en.wikipedia.org/wiki/Linsear_Write
        linsear_write_formula = textstat.linsear_write_formula(raw)
        worksheet.update("K" + row, linsear_write_formula)

        # Dale-Chall Readability Score
        # < 4.9 - average 4th-grade student | 5.0–5.9 - average 5th or 6th-grade
        # 6.0–6.9 - average 7th or 8th-grade | 7.0–7.9 - average 9th or 10th-grade
        # 8.0–8.9 - average 11th or 12th-grade | 9.0–9.9 - college student
        dale_chall_readability_score = textstat.dale_chall_readability_score(
            raw)
        worksheet.update("L" + row, dale_chall_readability_score)

        # Readability Consensus based upon all the above tests
        # Estimated school grade level required to understand the text
        text_standard = textstat.text_standard(raw, float_output=False)
        worksheet.update("M" + row, text_standard)
Example #24
def test_linsear_write_formula():
    textstat.set_lang("en_US")
    result = textstat.linsear_write_formula(long_test)

    assert result == 14.5
Example #25
File: test.py Project: shivam5992/textstat
def test_linsear_write_formula():
    result = textstat.linsear_write_formula(long_test)

    assert result == 14.5
Example #26
def main(dir: str):
    checker = language_tool_python.LanguageTool('en-US')
    emails = {}
    totalWords = ''

    filenames = [
        filename for filename in os.listdir(dir) if filename.endswith('.eml')
    ]
    for filename in filenames:
        print()
        print('[INFO] Processing {}...'.format(filename))

        with open(os.path.join(dir, filename), 'r', encoding='latin1') as file:
            try:
                mail = mailparser.parse_from_file_obj(file)
            except Exception as e:
                print('[WARNING] Error while parsing: {}'.format(e))
                continue
            # filter duplicates based on subject
            #if mail.subject in emails:
            #    print('[WARNING] This email seems to be a duplicate of "{}"! Skipping...'
            #        .format(emails[mail.subject]['filename']))
            #    continue

            # don't process if auth results missing
            # if 'Authentication-Results' not in mail.headers:
            #     print('[WARNING] This email is missing an authentication results header! Skipping...')
            #     continue

            attachments = ''
            for attachment in mail.attachments:
                attachment['filename'] = re.sub(r'<|>', '',
                                                attachment['filename'])
            try:
                mail.write_attachments(dir)
                for attachment in mail.attachments:
                    if re.search('image', attachment['mail_content_type']):
                        if re.search('gif', attachment['mail_content_type']):
                            images, _, _ = gif2numpy.convert(
                                dir + '\\' + attachment['filename'])
                            img = images[0]
                        else:
                            img = cv2.imread(dir + '\\' +
                                             attachment['filename'])
                        img = cv2.resize(img,
                                         None,
                                         fx=1.2,
                                         fy=1.2,
                                         interpolation=cv2.INTER_CUBIC)
                        text = pytesseract.image_to_string(img)
                        attachments += text
                    elif re.search('pdf', attachment['mail_content_type']):
                        encoding = chardet.detect(
                            pdf_to_text(dir + '\\' +
                                        attachment['filename']))['encoding']
                        attachments += pdf_to_text(
                            dir + '\\' +
                            attachment['filename']).decode(encoding)
                    # elif re.search('text', attachment['mail_content_type']):
                    #     #print(chardet.detect((attachment['payload']).encode()))
                    #     #encoding = chardet.detect(base64.b64decode(attachment['payload']).encode())['encoding']
                    #     #attachments += base64.b64decode(attachment['payload']).decode(encoding)
                    #     #print(codecs.encode(base64.b64decode(attachment['payload']), encoding=attachment['content_transfer_encoding']))
                    #     attachments += attachment['payload']
                    else:
                        attachments += attachment['payload']
                    os.remove(dir + '\\' + attachment['filename'])
            except Exception as e:
                print(
                    '[WARNING] Error while parsing attachments: {}'.format(e))
                [
                    os.remove(dir + '\\' + attachment['filename'])
                    for attachment in mail.attachments
                ]

            body = mail.subject + ' ' + \
                   remove_noise(BeautifulSoup(mail.body, 'lxml').get_text(separator=' ', strip=True) +
                                BeautifulSoup(attachments, 'lxml').get_text())
            blob = TextBlob(body)
            totalWords = totalWords + " " + body.lower()
            grammarErrors = checker.check(body)

            if 'Authentication-Results' in mail.headers:
                spf = re.findall(r'spf=(\S*)',
                                 mail.headers['Authentication-Results'])
                dkim = re.findall(r'dkim=(\S*)',
                                  mail.headers['Authentication-Results'])
                dmarc = re.findall(r'dmarc=(\S*)',
                                   mail.headers['Authentication-Results'])
            else:
                spf = dkim = dmarc = ''

            emails[filename] = {
                'filename': filename,
                # 'hops': mail.received[-1]['hop'],
                # 'totalDelay': sum([hop['delay']/60 for hop in mail.received]),
                'spf': spf[0] if len(spf) else None,
                'dkim': dkim[0] if len(dkim) else None,
                'dmarc': dmarc[0] if len(dmarc) else None,
                'subject': mail.subject,
                'from': mail.from_[0][1],
                'to': [tup[1] for tup in mail.to],
                'replyTo': [tup[1] for tup in mail.reply_to],
                'attachments': [x['filename'] for x in mail.attachments],
                'grammarErrors': len(grammarErrors),
                'counts': {
                    'characterCount': len(body),
                    'wordCount': textstat.lexicon_count(body),
                    'sentenceCount': textstat.sentence_count(body)
                },
                'readability': {
                    'flesch_kincaid':
                    textstat.flesch_kincaid_grade(body),
                    'gunning_fog':
                    textstat.gunning_fog(body),
                    'smog_index':
                    textstat.smog_index(body),
                    'automated_readability_index':
                    textstat.automated_readability_index(body),
                    'coleman_liau_index':
                    textstat.coleman_liau_index(body),
                    'linsear_write':
                    textstat.linsear_write_formula(body),
                },
                'sentiment': {
                    'polarity': blob.sentiment.polarity,
                    'subjectivity': blob.sentiment.subjectivity
                }
            }

            if save_body:
                emails[filename]['body'] = body

    ## quit if nothing found ##
    # if not emails:
    #     print('[WARNING] No files were found in "{}"!'.format(dir))
    #     return

    ## writing all words to file ##
    with open(os.path.join(dir, 'words.txt'), 'w', encoding='utf-8') as file:
        file.write(totalWords.lower())

    ## output json ##
    with open(os.path.join(dir, 'analysis.json'), 'w') as jsonFile:
        json.dump(emails, jsonFile, indent=2)

    ## build and output csv ##

    # generate and output headers using first email
    column_headers = list(flatten_json(emails[list(emails.keys())[0]]).keys())
    csvFile = open(os.path.join(dir, 'analysis.csv'), 'w', encoding='utf-8')
    csvFile.write(',{}\n'.format(','.join(column_headers)))

    # generate and output one line per email
    for email in emails.keys():
        # flatten json to 1 layer deep
        flattened_email = flatten_json(emails[email])
        # generate the values for this row
        csv_values = [
            '"' + str(flattened_email[column_header]) + '"'
            for column_header in column_headers
        ]
        # add email name and join w/ commas, then write out
        csvFile.write('{},{}\n'.format('"' + email + '"',
                                       ','.join(csv_values)))

    csvFile.close()

    # print out stats
    print('{}/{} emails processed; the rest failed during parsing.'.format(
        len(emails), len(filenames)))
Example #27
    def test_linsear_write_formula(self):
        result = textstat.linsear_write_formula(self.long_test)

        self.assertEqual(14.5, result)
Example #28
            if w[0].isupper():
                cnt += 1
        capital_count.append(cnt / len(cap_words))

        #obtaining readability features
        reviews[i] = reviews[i].strip().lower().replace("\'", '')
        kingrade.append(textstat.flesch_kincaid_grade(reviews[i]))
        gunning.append(textstat.gunning_fog(reviews[i]))
        flesch_reading_ease1.append(textstat.flesch_reading_ease(reviews[i]))
        difficult_words1.append(textstat.difficult_words(reviews[i]))
        smog_index1.append(textstat.smog_index(reviews[i]))
        automated_readability_index1.append(
            textstat.automated_readability_index(reviews[i]))
        coleman_liau_index1.append(textstat.coleman_liau_index(reviews[i]))
        linsear_write_formula1.append(
            textstat.linsear_write_formula(reviews[i]))
        dale_chall_readability_score1.append(
            textstat.dale_chall_readability_score(reviews[i]))
        word_freq = []

        #obtaining punctuation count
        words = word_tokenize(reviews[i])
        punct = [w for w in words if w in ['.', ',', ';', '?', ':', '!']]
        punct_count.append(len(punct) / len(words))

        #obtaining stopwords frequency
        word = [
            w for w in words
            if w not in ['.', ',', ';', '?', ':', '!', '"', "'", '#']
        ]
        corpus.append(reviews[i])
Example #29
def get_redability_assessments(data_text: str) -> Optional[dict]:
    divided_text = tokenize.sent_tokenize(data_text)
    word_tokenizes = nltk.word_tokenize(data_text)
    pos_tags = nltk.pos_tag(word_tokenizes)
    pos_tags_tagger = TAGGER.tag(word_tokenizes)
    f_dist = nltk.FreqDist(word_tokenizes)

    uniqueWordCount = compute_unique_word_count(f_dist.most_common())

    paragraphCount = max(len(data_text.split('\n')), len(data_text.split('\r\n')))

    counts = Counter(tag for word, tag in pos_tags)

    # Readability Grade Levels
    readability_grade_levels = dict(fleschKincaid=0, gunningFog=0, colemanLiau=0, smog=0,
                                    ari=0, forecastGradeLevel=0, powersSumnerKearlGrade=0, rix=0,
                                    raygorReadability=0, fryReadability=0, flesch=0)

    readability_grade_levels.update(fleschKincaid=textstat.flesch_kincaid_grade(data_text))
    readability_grade_levels.update(gunningFog=textstat.gunning_fog(data_text))
    readability_grade_levels.update(colemanLiau=textstat.coleman_liau_index(data_text))
    readability_grade_levels.update(smog=textstat.smog_index(data_text))
    readability_grade_levels.update(ari=textstat.automated_readability_index(data_text))
    readability_grade_levels.update(rix=textstat.rix(data_text))

    # need to check
    readability_grade_levels.update(forecastGradeLevel=round(20 - (textstat.avg_syllables_per_word(data_text) / 10), 2))

    readability_grade_levels.update(powersSumnerKearlGrade=round(textstat.avg_sentence_length(data_text) +
                                                                 textstat.avg_syllables_per_word(data_text) +
                                                                 2.7971, 2))
    readability_grade_levels.update(raygorReadability=count_raygor_readability(divided_text))
    readability_grade_levels.update(fryReadability=count_fry_readability(divided_text))
    # need to check

    readability_grade_levels.update(flesch=textstat.flesch_reading_ease(data_text))

    # Readability Scores
    readability_scores = dict(readableRating="", fleschReadingEase=0, cefrLevel='', ieltsLevel='', spacheScore=0,
                              newDaleChallScore=0, lixReadability=0, lensearWrite=0)
    readability_scores.update(readableRating=count_average_grade_levels(readability_grade_levels))
    readability_scores.update(fleschReadingEase=textstat.flesch_reading_ease(data_text))
    readability_scores.update(cefrLevel=count_cefr_levels(readability_grade_levels))
    readability_scores.update(ieltsLevel=count_ielts_levels(readability_grade_levels))
    readability_scores.update(spacheScore=round(textstat.spache_readability(data_text), 2))
    readability_scores.update(newDaleChallScore=textstat.dale_chall_readability_score_v2(data_text))
    readability_scores.update(lixReadability=textstat.lix(data_text))
    readability_scores.update(lensearWrite=textstat.linsear_write_formula(data_text))

    # Text Statistics
    text_statistics = dict(characterCount=0, syllableCount=0, wordCount=0, uniqueWordCount=0,
                           sentenceCount=0, paragraphCount=0)
    text_statistics.update(characterCount=textstat.char_count(data_text))
    text_statistics.update(syllableCount=textstat.syllable_count(data_text))
    text_statistics.update(wordCount=textstat.lexicon_count(data_text))
    text_statistics.update(uniqueWordCount=uniqueWordCount)
    text_statistics.update(sentenceCount=textstat.sentence_count(data_text))
    text_statistics.update(paragraphCount=paragraphCount)

    # Timings
    timings_statistics = dict(readingTime=0, speakingTime=0)
    timings_statistics.update(readingTime=reading_time(textstat.lexicon_count(data_text)))
    timings_statistics.update(speakingTime=speaking_time(textstat.lexicon_count(data_text)))

    # Text Composition
    text_composition = dict(adjectives=0, adverbs=0, conjunctions=0, determiners=0, interjections=0, nouns=0, verbs=0,
                            properNouns=0, prepositions=0, pronouns=0, qualifiers=0, unrecognised=0, nonWords=0)

    text_composition.update(adjectives=counts.get('JJ', 0) + counts.get('JJR', 0) + counts.get('JJS', 0))
    text_composition.update(adverbs=counts.get('RB', 0) + counts.get('RBR', 0) + counts.get('RBS', 0))
    text_composition.update(conjunctions=counts.get('CC', 0))
    text_composition.update(determiners=counts.get('DT', 0) + counts.get('PDT', 0) + counts.get('WDT', 0))
    text_composition.update(interjections=counts.get('UH', 0))
    text_composition.update(nouns=counts.get('NN', 0) + counts.get('NNS', 0))
    text_composition.update(
        verbs=counts.get('VB', 0) + counts.get('VBD', 0) + counts.get('VBG', 0) + counts.get('VBN', 0) + counts.get(
            'VBP', 0) + counts.get('VBZ', 0))
    text_composition.update(properNouns=counts.get('NNP', 0) + counts.get('NNPS', 0))
    text_composition.update(prepositions=counts.get('IN', 0))
    text_composition.update(
        pronouns=counts.get('PRP', 0) + counts.get('PRP$', 0) + counts.get('WP', 0) + counts.get('WP$', 0))
    text_composition.update(qualifiers=counts.get('RB', 0))
    text_composition.update(unrecognised=counts.get(None, 0))
    text_composition.update(nonWords=counts.get('.', 0) + counts.get(',', 0) + counts.get(':', 0))

    # Readability Issues
    text_readability_issues = dict(sentences30SyllablesCount=0, sentences20SyllablesCount=0,
                                   sentences30Syllables=[], sentences20Syllables=[],
                                   words4SyllablesCount=0, words12LettersCount=0,
                                   words4Syllables=[], words12Letters=[])

    sentences_30_syllables, sentences_30_count, sentences_20_syllables, sentences_20_count = count_sentences_syllables(
        divided_text)

    sentences_30_syllables = find_limit_offcet(data_text, sentences_30_syllables,
                                               "sentences_30_syllables",
                                               "sentences_30_syllables",
                                               "This sentence has more than 30 syllables. Consider rewriting it to be shorter or splitting it into smaller sentences.",
                                               "Readability Issues")
    sentences_20_syllables = find_limit_offcet(data_text, sentences_20_syllables,
                                               "sentences_20_syllables",
                                               "sentences_20_syllables",
                                               "This sentence has more than 20 syllables. Consider rewriting it to be shorter or splitting it into smaller sentences.",
                                               "Readability Issues")

    text_readability_issues.update(sentences30SyllablesCount=sentences_30_count,
                                   sentences20SyllablesCount=sentences_20_count)

    words_12_letters, words_12_count, words_4_syllables, words_4_count = words_sentence_syllables(divided_text)

    words_12_letters = find_limit_offcet(data_text, words_12_letters,
                                         "words_12_letters",
                                         "words_12_letters",
                                         "This word is more than 12 letters",
                                         "Readability Issues")
    words_4_syllables = find_limit_offcet(data_text, words_4_syllables,
                                          "words_4_syllables",
                                          "words_4_syllables",
                                          "This word is more than 4 syllables",
                                          "Readability Issues")

    text_readability_issues.update(words4SyllablesCount=words_4_count,
                                   words12LettersCount=words_12_count)

    # Writing Style Issues
    text_style_issues = dict(passiveVoiceCount=0, passiveVoices=[],
                             adverbsCount=0, adverbs=[],
                             clicheCount=0, cliches=[])
    passive_voises_return = find_passives(divided_text)
    passive_voises_return = find_limit_offcet(data_text, passive_voises_return,
                                              "passive_voises",
                                              "passive_voises",
                                              "Too much of using passive voises",
                                              "Writing Style Issues")
    adverbs_return = find_adverbs(pos_tags_tagger)
    adverbs_return = find_limit_offcet(data_text, adverbs_return,
                                       "adverbs",  # writing_style_issues
                                       "adverbs",
                                       "Too much of using adverbs",
                                       "Writing Style Issues")
    text_style_issues.update(passiveVoiceCount=len(passive_voises_return),
                             adverbsCount=len(adverbs_return))

    # Text Density Issues
    text_density_issues = dict(charactersPerWord=0, syllablesPerWord=0, wordsPerSentence=0,
                               wordsPerParagraph=0, sentencesPerParagraph=0)

    text_density_issues.update(charactersPerWord=textstat.avg_character_per_word(data_text),
                               syllablesPerWord=textstat.avg_syllables_per_word(data_text),
                               wordsPerSentence=round(textstat.lexicon_count(data_text) / len(divided_text), 2),
                               wordsPerParagraph=round(textstat.lexicon_count(data_text) / paragraphCount, 2),
                               sentencesPerParagraph=round(len(divided_text) / paragraphCount, 2))

    # Language Issues
    text_language_issues = dict(spellingIssuesCount=0, grammarIssueCount=0)

    matches_limit_offcet = sentences_20_syllables + sentences_30_syllables + words_4_syllables + words_12_letters + \
                           passive_voises_return + adverbs_return

    return dict(readabilityGradeLevels=readability_grade_levels,
                readabilityScores=readability_scores,
                textStatistics=text_statistics,
                timings=timings_statistics,
                textComposition=text_composition,
                textReadabilityIssues=text_readability_issues,
                textStyleIssues=text_style_issues,
                textDensityIssues=text_density_issues,
                textLanguageIssues=text_language_issues,
                matches=matches_limit_offcet)
    "Dale-Chall Readability Score","Readability Consensus"]"""
    df = pd.DataFrame(columns=col_names)
    df["Sentences"] = sentences
    df["Word count"] = df["Sentences"].apply(lambda x: word_count(x))
    df["Sentence Length"] = df["Sentences"].apply(lambda x: sentence_count(x))
    df["Average Sentence length"] = df["Sentences"].apply(
        lambda x: avg_sentence_length(x))
    df["Syllable Count"] = df["Sentences"].apply(lambda x: syllables_count(x))
    df["Average syllables per words"] = df["Sentences"].apply(
        lambda x: avg_syllables_per_word(x))
    df["Polysyllablic count"] = df["Sentences"].apply(
        lambda x: poly_syllable_count(x))
    df["Lexicon Count"] = df["Sentences"].apply(lambda x: lexical_counts(x))
    df["Flesch Reading Ease score"] = df["Sentences"].apply(
        lambda x: flesch_reading_ease(x))
    df["Flesch-Kincaid Grade Level"] = df["Sentences"].apply(
        lambda x: textstat.flesch_kincaid_grade(x))
    df["Fog Scale"] = df["Sentences"].apply(lambda x: gunning_fog(x))
    df["SMOG Index"] = df["Sentences"].apply(lambda x: smog_index(x))
    df["Automated Readability Index"] = df["Sentences"].apply(
        lambda x: textstat.automated_readability_index(x))
    df["Coleman-Liau Index"] = df["Sentences"].apply(
        lambda x: textstat.coleman_liau_index(x))
    df["Linsear Write Formula"] = df["Sentences"].apply(
        lambda x: textstat.linsear_write_formula(x))
    df["Dale-Chall Readability Score"] = df["Sentences"].apply(
        lambda x: dale_chall_readability_score(x))
    df["Readability Consensus"] = df["Sentences"].apply(
        lambda x: textstat.text_standard(x, float_output=False))
    df.to_hdf('textstat_data.h5', key='textstat', mode='w')
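The frame is persisted with to_hdf; reading it back is symmetric (same file and key as written above; the pytables package must be installed):

# Round-trip sketch: load the table written above.
import pandas as pd

df = pd.read_hdf('textstat_data.h5', key='textstat')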
Example #31
def download(request):
    global tweetsList

    response = HttpResponse(content_type='application/x-download')
    response['Content-Disposition'] = 'attachment; filename="tweets.csv"'

    #set headers of csv
    fieldnames = ['datetime', 'last updated', 'original username', 'original screen name',
                  'original user location', 'original user verified', 'retweet', 'retweeter username',
                  'retweeter screen name', 'retweeter location', 'retweeter verified', 'text', 'comment',
                  # 'hashtags', 'urls', '#retweets','#favorites', '#retweets of retweet',
                  'hashtags', 'urls', '#retweets', '#favorites',
                  '#favorites of retweet', 'original syllable count', 'original lexicon count',
                  'original sentence count', 'original flesch reading ease score', 'original flesch-kincaid grade level',
                  'original fog scale', 'original smog index', 'original automated readability index', 'original coleman-liau index',
                  'original linsear write level', 'original dale-chall readability score', 'original difficult words',
                  'original readability consensus', 'original neg sentiment', 'original neu sentiment', 'original pos sentiment',
                  'original overall sentiment', 'comment syllable count', 'comment lexicon count',
                  'comment sentence count', 'comment flesch reading ease score', 'comment flesch-kincaid grade level',
                  'comment fog scale', 'comment smog index', 'comment automated readability index', 'comment coleman-liau index',
                  'comment linsear write level', 'comment dale-chall readability score', 'comment difficult words',
                  'comment readability consensus', 'comment neg sentiment', 'comment neu sentiment', 'comment pos sentiment',
                  'comment overall sentiment', 'combined syllable count', 'combined lexicon count',
                  'combined sentence count', 'combined flesch reading ease score', 'combined flesch-kincaid grade level',
                  'combined fog scale', 'combined smog index', 'combined automated readability index', 'combined coleman-liau index',
                  'combined linsear write level', 'combined dale-chall readability score', 'combined difficult words',
                  'combined readability consensus', 'combined neg sentiment', 'combined neu sentiment', 'combined pos sentiment',
                  'combined overall sentiment', 'twitter users query', 'twitter excluded users query', 'twitter hashtags query', 'twitter keywords query',
                  'twitter from date query', 'twitter to date query']

    writer = csv.writer(response, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(fieldnames)

    for tweet in tweetsList:
        #combine hashtags of tweet into string separated by commas
        hashtagString = ""
        tweetHashtags = HashtagLog.objects.filter(tweet__id=tweet.id)
        for i in range(len(tweetHashtags)):
            if i == 0:
                hashtagString += tweetHashtags[i].hashtag.hashtagText
            else:
                hashtagString += ", " + tweetHashtags[i].hashtag.hashtagText

        #combine urls of tweet into string separated by commas
        urlString = ""
        tweetUrls = UrlLog.objects.filter(tweet__id=tweet.id)
        for i in range(len(tweetUrls)):
            if i == 0:
                urlString += tweetUrls[i].url.urlText
            else:
                urlString += ", " + tweetUrls[i].url.urlText

        #display yes or no in verified column for original user
        if tweet.originalUser.isVerified:
            originalVerifiedString = "yes"
        else:
            originalVerifiedString = "no"

        #if not a retweet, new user fields should be empty
        newUsername = None
        newScreenName = None
        newLocation = None
        newVerifiedString = None

        #if retweet:
        #display yes or no in verified column for new user
        if tweet.newUser:
            if tweet.newUser.isVerified:
                newVerifiedString = "yes"
            else:
                newVerifiedString = "no"

            #set retweet fields
            newUsername = tweet.newUser.username
            newScreenName = tweet.newUser.screenName
            newLocation = tweet.newUser.location

        #display yes or no in retweet column
        if tweet.isRetweet:
            isRetweetString = "yes"
        else:
            isRetweetString = "no"

        #get sentiment scores of original text
        sid_obj = SentimentIntensityAnalyzer()
        sentiment_dict_original = sid_obj.polarity_scores(tweet.originalText)

        #combine comment text and original text and get sentiment scores for the combination
        commentText = ""
        if tweet.commentText:
            commentText = tweet.commentText
        sentiment_dict_combined = sid_obj.polarity_scores(tweet.originalText + commentText)

        #initialize all comment word processing to empty strings in case there is no comment text
        cSyllableCount = ""
        cLexiconCount = ""
        cSentenceCount = ""
        cFleschReadingEase = ""
        cFleschKincaidGrade = ""
        cGunningFog = ""
        cSmogIndex = ""
        cAutomatedReadabilityIndex = ""
        cColemanLiauIndex = ""
        cLinsearWriteFormula = ""
        cDaleChallReadabilityScore = ""
        cDifficultWords = ""
        cTextStandard = ""

        #if there is comment text, get language processing stats for comment text
        if tweet.commentText is not None:
            cSyllableCount = textstat.syllable_count(tweet.commentText, lang='en_US')
            cLexiconCount = textstat.lexicon_count(tweet.commentText, removepunct=True)
            cSentenceCount = textstat.sentence_count(tweet.commentText)
            cFleschReadingEase = textstat.flesch_reading_ease(tweet.commentText)
            cFleschKincaidGrade = textstat.flesch_kincaid_grade(tweet.commentText)
            cGunningFog = textstat.gunning_fog(tweet.commentText)
            cSmogIndex = textstat.smog_index(tweet.commentText)
            cAutomatedReadabilityIndex = textstat.automated_readability_index(tweet.commentText)
            cColemanLiauIndex = textstat.coleman_liau_index(tweet.commentText)
            cLinsearWriteFormula = textstat.linsear_write_formula(tweet.commentText)
            cDaleChallReadabilityScore = textstat.dale_chall_readability_score(tweet.commentText)
            cDifficultWords = textstat.difficult_words(tweet.commentText)
            cTextStandard = textstat.text_standard(tweet.commentText, float_output=False)

        #get sentiment scores for comment text
        cNegSent = ""
        cNeuSent = ""
        cPosSent = ""
        cCompoundSent = ""
        if tweet.commentText:
            sentiment_dict_comment = sid_obj.polarity_scores(tweet.commentText)
            cNegSent = sentiment_dict_comment['neg']
            cNeuSent = sentiment_dict_comment['neu']
            cPosSent = sentiment_dict_comment['pos']
            cCompoundSent = sentiment_dict_comment['compound']

        #write all information about the tweet, and its language processing stats to row in csv
        writer.writerow(
            [tweet.createdAt, tweet.lastUpdated, tweet.originalUser.username,
             tweet.originalUser.screenName, tweet.originalUser.location, originalVerifiedString,
             isRetweetString, newUsername, newScreenName, newLocation, newVerifiedString,
             tweet.originalText, tweet.commentText, hashtagString, urlString, tweet.numRetweetsOriginal,
             # tweet.numFavoritesOriginal, tweet.numRetweetsNew, tweet.numFavoritesNew,
             tweet.numFavoritesOriginal, tweet.numFavoritesNew,
             textstat.syllable_count(tweet.originalText, lang='en_US'),
             textstat.lexicon_count(tweet.originalText, removepunct=True),
             textstat.sentence_count(tweet.originalText),
             textstat.flesch_reading_ease(tweet.originalText),
             textstat.flesch_kincaid_grade(tweet.originalText),
             textstat.gunning_fog(tweet.originalText),
             textstat.smog_index(tweet.originalText),
             textstat.automated_readability_index(tweet.originalText),
             textstat.coleman_liau_index(tweet.originalText),
             textstat.linsear_write_formula(tweet.originalText),
             textstat.dale_chall_readability_score(tweet.originalText),
             textstat.difficult_words(tweet.originalText),
             textstat.text_standard(tweet.originalText, float_output=False),
             sentiment_dict_original['neg'], sentiment_dict_original['neu'],
             sentiment_dict_original['pos'], sentiment_dict_original['compound'], cSyllableCount,
             cLexiconCount, cSentenceCount, cFleschReadingEase, cFleschKincaidGrade, cGunningFog,
             cSmogIndex, cAutomatedReadabilityIndex, cColemanLiauIndex, cLinsearWriteFormula, cDaleChallReadabilityScore,
             cDifficultWords, cTextStandard, cNegSent, cNeuSent, cPosSent, cCompoundSent,
             textstat.syllable_count(tweet.originalText + commentText, lang='en_US'),
             textstat.lexicon_count(tweet.originalText + commentText, removepunct=True),
             textstat.sentence_count(tweet.originalText + commentText),
             textstat.flesch_reading_ease(tweet.originalText + commentText),
             textstat.flesch_kincaid_grade(tweet.originalText + commentText),
             textstat.gunning_fog(tweet.originalText + commentText),
             textstat.smog_index(tweet.originalText + commentText),
             textstat.automated_readability_index(tweet.originalText + commentText),
             textstat.coleman_liau_index(tweet.originalText + commentText),
             textstat.linsear_write_formula(tweet.originalText + commentText),
             textstat.dale_chall_readability_score(tweet.originalText + commentText),
             textstat.difficult_words(tweet.originalText + commentText),
             textstat.text_standard(tweet.originalText + commentText, float_output=False),
             sentiment_dict_combined['neg'], sentiment_dict_combined['neu'],
             sentiment_dict_combined['pos'], sentiment_dict_combined['compound'],
             tweet.twitterQueryUsers, tweet.twitterQueryNotUsers,
             tweet.twitterQueryHashtags, tweet.twitterQueryKeywords,
             tweet.twitterQueryFromDate, tweet.twitterQueryToDate]
        )

    return response