Example #1
    def transform(self, input_df: pd.DataFrame) -> coo_matrix:
        """
        Compute and return the linguistic features for the input DataFrame.
        The DataFrame must contain the columns Q_TEXT and Q_ID.
        :param input_df: DataFrame with the question text (Q_TEXT) and question ID (Q_ID) columns.
        :return: coo_matrix with one row of linguistic features per question.
        """
        if Q_TEXT not in input_df.columns:
            raise ValueError("Q_TEXT should be in input_df.columns")
        if Q_ID not in input_df.columns:
            raise ValueError("Q_ID should be in input_df.columns")

        correct_ans_text_dict = gen_correct_answers_dict(input_df)
        wrong_ans_text_dict = gen_wrong_answers_dict(input_df)

        df = pd.DataFrame()
        df['lexicon_count_question'] = input_df.apply(
            lambda r: textstat.lexicon_count(r[Q_TEXT]), axis=1)
        df['lexicon_count_correct_choices'] = input_df.apply(
            lambda r: np.mean([
                textstat.lexicon_count(x)
                for x in correct_ans_text_dict[r[Q_ID]]
            ]),
            axis=1)
        df['lexicon_count_wrong_choices'] = input_df.apply(
            lambda r: np.mean([
                textstat.lexicon_count(x)
                for x in wrong_ans_text_dict[r[Q_ID]]
            ]),
            axis=1)
        df['sentence_count_question'] = input_df.apply(
            lambda r: textstat.sentence_count(r[Q_TEXT]), axis=1)
        df['sentence_count_correct_choices'] = input_df.apply(
            lambda r: np.mean([
                textstat.sentence_count(x)
                for x in correct_ans_text_dict[r[Q_ID]]
            ]),
            axis=1)
        df['sentence_count_wrong_choices'] = input_df.apply(
            lambda r: np.mean([
                textstat.sentence_count(x)
                for x in wrong_ans_text_dict[r[Q_ID]]
            ]),
            axis=1)
        df['avg_word_len_question'] = input_df.apply(
            lambda r: np.mean([len(x) for x in r[Q_TEXT].split(' ')]), axis=1)
        df['ratio_len_question_correct_choices'] = df.apply(
            lambda r: (1 + r['lexicon_count_question']) /
            (1 + r['lexicon_count_correct_choices']),
            axis=1)
        df['ratio_len_question_wrong_choices'] = df.apply(
            lambda r: (1 + r['lexicon_count_question']) /
            (1 + r['lexicon_count_wrong_choices']),
            axis=1)
        return coo_matrix(df.values)
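
A minimal, self-contained sketch of the same feature extraction on a toy DataFrame (the column name and data below are made up; only pandas, scipy, and textstat are assumed):

import pandas as pd
import textstat
from scipy.sparse import coo_matrix

toy = pd.DataFrame({"question_text": [
    "What is the capital of France?",
    "Name two noble gases. Explain your choice.",
]})
feats = pd.DataFrame({
    "lexicon_count_question": toy["question_text"].apply(textstat.lexicon_count),
    "sentence_count_question": toy["question_text"].apply(textstat.sentence_count),
})
X = coo_matrix(feats.values)  # sparse matrix, like the value returned by transform() above
print(X.shape)  # (2, 2)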
Example #2
 def check_difficulty(self):
     text = self.textoutput
     # Approximate Flesch Reading Ease bands:
     #   0-30 = college
     #  50-60 = high school
     #    60+ = middle school / elementary school
     try:
         grade_level = textstat.text_standard(text)
         reading_ease = textstat.flesch_reading_ease(text)  #requires chart
         sentence_count = textstat.sentence_count(text)
         difficult_words = self.get_difficult_words(text)
         replacement_words = self.get_replacement_words(difficult_words)
         output = "Grade Level of Input Text: " + grade_level + "\n"
         #output = output + "Ease of Reading*: " + str(reading_ease) + "\n"
         output = output + "Sentence Count: " + str(sentence_count) + "\n"
         output = output + "Difficult Words Found: " + str(
             len(difficult_words)) + "\n"
         output = output + "Possible Replacements: " + "\n"
         for dw in replacement_words:
             output = output + dw + " -> "
             for word in replacement_words[dw]:
                 output = output + word + ", "
             output = output + "\n"
         self.difficultyReport = output
     except Exception:
         self.difficultyReport = "Error determining Difficulties"
Example #3
 def statistics(self, text):
     self.asl = textstat.avg_sentence_length(text)
     self.avg_sentence_per_word = textstat.avg_sentence_per_word(text)
     self.avg_syllables_per_word = textstat.avg_syllables_per_word(text)
     self.difficult_words = textstat.difficult_words(text)
     self.lexicon_count = textstat.lexicon_count(text)
     self.polysyllable_count = textstat.polysyllabcount(text)
     self.sentence_count = textstat.sentence_count(text)
def avg_sentence_len() -> List:
    """return sentence length in each policy
    """
    count = []
    for text in policies['Policy']:
        w_count = len(text.split())
        count.append(int(w_count / textstat.sentence_count(text)))
    return count
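
A toy usage sketch, assuming the function above is in scope together with `import textstat` and `from typing import List`; the `policies` DataFrame here is made up:

import pandas as pd

policies = pd.DataFrame({"Policy": [
    "We collect your email address. We never sell it to anyone else.",
    "This policy explains in one fairly long sentence how your data is handled.",
]})
print(avg_sentence_len())  # one integer (words per sentence) per policy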
def do_datas():
    # logging.info('do_datas')

    ########### Save text statistics
    ##### 1. nw 2. nvocab 3. nsyllable 4.nsentence 5. tone 6. readability
    ## 1. nw
    nw.append(len(words))
    ## 2. nvocab
    nvocab.append(len(vocab))
    ## 3. syllable
    n = textstat.syllable_count(contents)
    nsyllable.append(n)
    ## 4. sentence
    n = textstat.sentence_count(contents)
    nsentence.append(n)
    ## 5. tone
    ### LM dictionary
    n_neg_lm.append(count_occurrence(words, lm_neg))
    n_pos_lm.append(count_occurrence(words, lm_pos))
    n_uctt_lm.append(count_occurrence(words, lm_uctt))
    n_lit_lm.append(count_occurrence(words, lm_lit))
    n_cstr_lm.append(count_occurrence(words, lm_cstr))
    n_modal1_lm.append(count_occurrence(words, lm_modal1))
    n_modal2_lm.append(count_occurrence(words, lm_modal2))
    n_modal3_lm.append(count_occurrence(words, lm_modal3))
    n_negation_lm.append(count_negation(words, lm_pos, gt_negation))
    ### General Inquirer dictionary
    n_neg_gi.append(count_occurrence(words, gi_neg))
    n_pos_gi.append(count_occurrence(words, gi_pos))
    n_negation_gi.append(count_negation(words, gi_pos, gt_negation))
    ### Henry dictionary
    n_neg_hr.append(count_occurrence(words, hr_neg))
    n_pos_hr.append(count_occurrence(words, hr_pos))
    n_negation_hr.append(count_negation(words, gi_pos, gt_negation))
    ## 6. readability
    fre_i = textstat.flesch_reading_ease(contents)
    if fre_i > 100:
        fre_i = 100
    if fre_i < 0:
        fre_i = float('NaN')
    fre.append(fre_i)
    fkg_i = textstat.flesch_kincaid_grade(contents)
    if fkg_i < 0:
        fkg_i = float('NaN')
    fkg.append(fkg_i)
    # RIX
    cl_i = textstat.coleman_liau_index(contents)
    if cl_i < 0:
        cl_i = float('NaN')
    cl.append(cl_i)
    f = textstat.gunning_fog(contents)
    fog.append(f)
    f = textstat.automated_readability_index(contents)
    ari.append(f)
    f = textstat.smog_index(contents)
    smog.append(f)
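
The Flesch Reading Ease clamping above (cap at 100, negative scores become NaN) can be factored into a small helper; a sketch, with a helper name of my choosing:

import textstat

def clamped_flesch_reading_ease(text: str) -> float:
    """Flesch Reading Ease capped at 100; negative scores are mapped to NaN."""
    score = textstat.flesch_reading_ease(text)
    if score > 100:
        return 100.0
    if score < 0:
        return float("nan")
    return score

print(clamped_flesch_reading_ease("Short, simple sentences read easily. They score high."))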
Example #6
def display_scores(name: str, text: str) -> None:
    nameLabel = Label(root, text=name+": ", font=12, pady=10)
    nameLabel.pack()
    fleschEaseScore = textstat.flesch_reading_ease(text)
    fESAll = fleschIndex.fleschscore()
    fESAll.sort()
    print(fESAll)
    percentile = 100
    for i in range(len(fESAll)):
        if fleschEaseScore < fESAll[i]:
            percentile = int(100 * (i / len(fESAll)))
            break

    fESContent = "Flesch Reading Ease Score: {},\n which is in the {}th percentile.".format(fleschEaseScore, percentile)
    fESLbl = Label(root, text=fESContent, font=12, pady=10)
    fESLbl.pack()

    flKinScore = textstat.flesch_kincaid_grade(text)
    fKSAll = fleschIndex.fleschkincaid()
    fKSAll.sort(reverse=True)
    print(fKSAll)
    percentile = 100
    for i in range(len(fKSAll)):
        if flKinScore > fKSAll[i]:
            percentile = int(100 * (i / len(fKSAll)))
            break

    fKSContent = "Flesch-Kincaid Grade: {},\n which is in the {}th percentile.".format(flKinScore, percentile)
    fKSLbl = Label(root, text=fKSContent, font=12, pady=10)
    fKSLbl.pack()

    avgSentenceLen = int(len(text.split()) / textstat.sentence_count(text))
    aSLAll = fleschIndex.avg_sentence_len()
    aSLAll.sort(reverse=True)
    print(aSLAll)
    percentile = 100
    for i in range(len(aSLAll)):
        if avgSentenceLen > aSLAll[i]:
            percentile = int(100 * (i / len(aSLAll)))
            break
    aSLContent = "Average Sentence Length is: {},\n which is in the {}th percentile.".format(avgSentenceLen, percentile)
    aSLLbl = Label(root, text=aSLContent, font=12, pady=10)
    aSLLbl.pack()

    predictedScore = predict_score(text)
    predictedScoreContent = "Regression Model Predicted Score is: {}/5.".format("{0:.2f}".format(predictedScore))
    pSLbl = Label(root, text=predictedScoreContent, font=15, pady=15, fg="blue")
    pSLbl.pack()
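
The manual percentile loops above can be expressed more compactly with the standard library; a rough equivalent sketch (function name and sample scores are mine, and tie handling differs slightly from the loops above):

from bisect import bisect_left

def percentile_of(score, reference_scores):
    """Percentage of reference scores strictly below `score`."""
    ordered = sorted(reference_scores)
    return int(100 * bisect_left(ordered, score) / len(ordered))

print(percentile_of(65.0, [30.5, 50.0, 62.1, 70.3, 88.9]))  # 60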
def lisibilty(text):

    f_lis = ([
        textstat.syllable_count(str(text), lang='en_arabic'),
        textstat.lexicon_count(str(text), removepunct=True),
        textstat.sentence_count(str(text)),
        textstat.flesch_reading_ease(str(text)),
        textstat.flesch_kincaid_grade(str(text)),
        textstat.gunning_fog(str(text)),
        textstat.smog_index(str(text)),
        textstat.automated_readability_index(str(text)),
        textstat.coleman_liau_index(str(text)),
        textstat.linsear_write_formula(str(text)),
        textstat.dale_chall_readability_score(str(text))
    ])
    return f_lis
def get_desc_data(string):
    '''
    Input: book description string
    Output: returns desc_semantic, word_cnt, description_len, number_unique_words,
            average_word_len, syl_count, lex_count, sent_count, flesch
    '''
    #Data before text processing
    desc_semantic = get_semantic(string)
    syl_count = syllable_count(string)
    lex_count = lexicon_count(string)
    sent_count = sentence_count(string)
    flesch = flesch_reading_ease(string)

    #Data after text processing
    string = text_preprocess(string)
    word_cnt = word_count(string)
    description_len = desc_len(string)
    number_unique_words = num_unique_words(string)
    average_word_len = avg_word_len(string)
    return desc_semantic, word_cnt, description_len, number_unique_words, \
           average_word_len, syl_count, lex_count, sent_count, flesch
Example #9
def process_datum(datum):
    # Remove tags
    soup = BeautifulSoup(datum["Content"], features="html.parser")

    clean_soup = BeautifulSoup(datum["Content"], features="html.parser")
    for elm in clean_soup(["code"]):
        elm.extract()

    body_text = clean_soup.get_text()

    pos_tags = pos_tag(word_tokenize(body_text))

    pos_counts = Counter([tag for word, tag in pos_tags])
    # preterm_counts =

    result = {}
    result['TEXT'] = body_text
    result['CT1'] = lexicon_count(body_text)
    result['CT2'] = sentence_count(body_text)
    for tag in POS_TAGS:
        result['CT3.' + tag] = pos_counts[tag]
    # for preterm in PRETERMINALS:
    # results['CT4.' + preterm] =
    result['CN1'] = len(soup.find_all("code", href=True)) +\
        len(soup.find_all("img", href=True)) +\
        len(soup.find_all("span", {"class": "math-container"}))
    result['CN2'] = len(soup.find_all("a", href=True))
    result['U1.SUM'] = datum['U1.SUM']
    result['U1.1'] = datum['U1.1']
    result['U1.2'] = datum['U1.2']
    result['U2'] = datum['U2']
    result['Y1'] = datum['Y1']
    result['Y2'] = datum['Y2']
    result['T'] = datum['T']
    result['S'] = datum['S']
    result['D'] = datum['D']

    return result
Example #10
def convert_txt_file(window,
                     inputFilename,
                     inputDir,
                     outputDir,
                     openOutputFiles,
                     excludeStopWords=True,
                     lemmatizeWords=True):
    filesToOpen = []
    outputFilename = IO_files_util.generate_output_file_name(
        inputFilename, inputDir, outputDir, '.txt', 'corpus', 'lemma_stw')
    filesToOpen.append(outputFilename)

    inputDocs = IO_files_util.getFileList(inputFilename,
                                          inputDir,
                                          fileType='.txt')

    Ndocs = str(len(inputDocs))

    IO_user_interface_util.timed_alert(
        GUI_util.window, 2000, 'Analysis start',
        'Started running txt conversion (lemmatization & stopwords) at', True)

    with open(outputFilename,
              'w',
              encoding='utf-8',
              errors='ignore',
              newline='') as outfile:
        #print("Number of corpus text documents: ",Ndocs)
        #currentLine.append([Ndocs])
        index = 0
        for doc in inputDocs:
            head, tail = os.path.split(doc)
            index = index + 1
            # currentLine.append([index])
            print("Processing file " + str(index) + "/" + str(Ndocs) + " " +
                  tail)
            with open(doc, "r", encoding="utf-8", errors="ignore") as docfile:
                fullText = docfile.read()

            Nsentences = str(textstat.sentence_count(fullText))
            #print('TOTAL number of sentences: ',Nsentences)

            Nwords = str(textstat.lexicon_count(fullText, removepunct=True))
            #print('TOTAL number of words: ',Nwords)

            Nsyllables = textstat.syllable_count(fullText, lang='en_US')
            #print('TOTAL number of Syllables: ',Nsyllables)

            # words = fullText.split()
            words = nltk.word_tokenize(fullText)

            if excludeStopWords:
                words = excludeStopWords_list(words)

            if lemmatizeWords:
                lemmatizer = WordNetLemmatizer()
                text_vocab = set(
                    lemmatizer.lemmatize(w.lower())
                    for w in fullText.split(" ") if w.isalpha())
                words = set(
                    lemmatizing(w.lower()) for w in words
                    if w.isalpha())  # fullText.split(" ") if w.isalpha())
Example #11
        file_name_xl = './' + file[:-4] + ' - Clause Analysis.xls'
        print('working on:', file)

        #Read in CSV
        df = pd.read_csv('./Data/' + file, header=0)
        text_data = np.array(df.iloc[:, 3])
        pod_id = np.array(df.iloc[:, 0])
        speaker_id = np.array(df.iloc[:, 2])
        sentence_complexity = []
        counted_clauses = []
        instance_ids = []
        for i, instance in enumerate(text_data):
            # calculate sentence complexity
            no_clauses = count_clauses(instance)
            counted_clauses.append(no_clauses)
            sentence_complexity.append(no_clauses/textstat.sentence_count(instance))

            inst_id = str(pod_id[i]) + '_' + str(speaker_id[i])
            instance_ids.append(inst_id)

        # make dictionary of results
        clause_analysis = {
            'instance' : instance_ids,
            'number of clauses' : counted_clauses,
            'sentence complexity' : sentence_complexity
        }

        # export to excel sheet
        new_df = pd.DataFrame(clause_analysis, columns=['instance', 'number of clauses', 'sentence complexity'])
        new_df.to_excel(file_name_xl, index=False, sheet_name='Clause')
Example #12
def download(request):
    global tweetsList

    response = HttpResponse(content_type='application/x-download')
    response['Content-Disposition'] = 'attachment; filename="tweets.csv"'

    #set headers of csv
    fieldnames = ['datetime', 'last updated', 'original username', 'original screen name',
                  'original user location', 'original user verified', 'retweet', 'retweeter username',
                  'retweeter screen name', 'retweeter location', 'retweeter verified', 'text', 'comment',
                  # 'hashtags', 'urls', '#retweets','#favorites', '#retweets of retweet',
                  'hashtags', 'urls', '#retweets', '#favorites',
                  '#favorites of retweet', 'original syllable count', 'original lexicon count',
                  'original sentence count', 'original flesch reading ease score', 'original flesch-kincaid grade level',
                  'original fog scale', 'original smog index', 'original automated readability index', 'original coleman-liau index',
                  'original linsear write level', 'original dale-chall readability score', 'original difficult words',
                  'original readability consensus', 'original neg sentiment', 'original neu sentiment', 'original pos sentiment',
                  'original overall sentiment', 'comment syllable count', 'comment lexicon count',
                  'comment sentence count', 'comment flesch reading ease score', 'comment flesch-kincaid grade level',
                  'comment fog scale', 'comment smog index', 'comment automated readability index', 'comment coleman-liau index',
                  'comment linsear write level', 'comment dale-chall readability score', 'comment difficult words',
                  'comment readability consensus', 'comment neg sentiment', 'comment neu sentiment', 'comment pos sentiment',
                  'comment overall sentiment', 'combined syllable count', 'combined lexicon count',
                  'combined sentence count', 'combined flesch reading ease score', 'combined flesch-kincaid grade level',
                  'combined fog scale', 'combined smog index', 'combined automated readability index', 'combined coleman-liau index',
                  'combined linsear write level', 'combined dale-chall readability score', 'combined difficult words',
                  'combined readability consensus', 'combined neg sentiment', 'combined neu sentiment', 'combined pos sentiment',
                  'combined overall sentiment', 'twitter users query', 'twitter excluded users query', 'twitter hashtags query', 'twitter keywords query',
                  'twitter from date query', 'twitter to date query']

    writer = csv.writer(response, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(fieldnames)

    for tweet in tweetsList:
        #combine hashtags of tweet into string separated by commas
        hashtagString = ""
        tweetHashtags = HashtagLog.objects.filter(tweet__id=tweet.id)
        for i in range(len(tweetHashtags)):
            if i == 0:
                hashtagString += tweetHashtags[i].hashtag.hashtagText
            else:
                hashtagString += ", " + tweetHashtags[i].hashtag.hashtagText

        #combine urls of tweet into string separated by commas
        urlString = ""
        tweetUrls = UrlLog.objects.filter(tweet__id=tweet.id)
        for i in range(len(tweetUrls)):
            if i == 0:
                urlString += tweetUrls[i].url.urlText
            else:
                urlString += ", " + tweetUrls[i].url.urlText

        #display yes or no in verified column for original user
        if tweet.originalUser.isVerified:
            originalVerifiedString = "yes"
        else:
            originalVerifiedString = "no"

        #if not a retweet, new user fields should be empty
        newUsername = None
        newScreenName = None
        newLocation = None
        newVerifiedString = None

        #if retweet:
        #display yes or no in verified column for new user
        if tweet.newUser:
            if tweet.newUser.isVerified:
                newVerifiedString = "yes"
            else:
                newVerifiedString = "no"

            #set retweet fields
            newUsername = tweet.newUser.username
            newScreenName = tweet.newUser.screenName
            newLocation = tweet.newUser.location

        #display yes or no in retweet column
        if tweet.isRetweet:
            isRetweetString = "yes"
        else:
            isRetweetString = "no"

        #get sentiment scores of original text
        sid_obj = SentimentIntensityAnalyzer()
        sentiment_dict_original = sid_obj.polarity_scores(tweet.originalText)

        #combine comment text and original text and get sentiment scores for the combination
        commentText = ""
        if tweet.commentText:
            commentText = tweet.commentText
        sentiment_dict_combined = sid_obj.polarity_scores(tweet.originalText + commentText)

        #initialize all comment word processing to empty strings in case there is no comment text
        cSyllableCount = ""
        cLexiconCount = ""
        cSentenceCount = ""
        cFleschReadingEase = ""
        cFleschKincaidGrade = ""
        cGunningFog = ""
        cSmogIndex = ""
        cAutomatedReadabilityIndex = ""
        cColemanLiauIndex = ""
        cLinsearWriteFormula = ""
        cDaleChallReadabilityScore = ""
        cDifficultWords = ""
        cTextStandard = ""

        #if there is comment text, get language processing stats for comment text
        if tweet.commentText is not None:
            cSyllableCount = textstat.syllable_count(tweet.commentText, lang='en_US')
            cLexiconCount = textstat.lexicon_count(tweet.commentText, removepunct=True)
            cSentenceCount = textstat.sentence_count(tweet.commentText)
            cFleschReadingEase = textstat.flesch_reading_ease(tweet.commentText)
            cFleschKincaidGrade = textstat.flesch_kincaid_grade(tweet.commentText)
            cGunningFog = textstat.gunning_fog(tweet.commentText)
            cSmogIndex = textstat.smog_index(tweet.commentText)
            cAutomatedReadabilityIndex = textstat.automated_readability_index(tweet.commentText)
            cColemanLiauIndex = textstat.coleman_liau_index(tweet.commentText)
            cLinsearWriteFormula = textstat.linsear_write_formula(tweet.commentText)
            cDaleChallReadabilityScore = textstat.dale_chall_readability_score(tweet.commentText)
            cDifficultWords = textstat.difficult_words(tweet.commentText)
            cTextStandard = textstat.text_standard(tweet.commentText, float_output=False)

        #get sentiment scores for comment text
        cNegSent = ""
        cNeuSent = ""
        cPosSent = ""
        cCompoundSent = ""
        if tweet.commentText:
            sentiment_dict_comment = sid_obj.polarity_scores(tweet.commentText)
            cNegSent = sentiment_dict_comment['neg']
            cNeuSent = sentiment_dict_comment['neu']
            cPosSent = sentiment_dict_comment['pos']
            cCompoundSent = sentiment_dict_comment['compound']

        #write all information about the tweet, and its language processing stats to row in csv
        writer.writerow(
            [tweet.createdAt, tweet.lastUpdated, tweet.originalUser.username,
             tweet.originalUser.screenName, tweet.originalUser.location, originalVerifiedString,
             isRetweetString, newUsername, newScreenName, newLocation, newVerifiedString,
             tweet.originalText, tweet.commentText, hashtagString, urlString, tweet.numRetweetsOriginal,
             # tweet.numFavoritesOriginal, tweet.numRetweetsNew, tweet.numFavoritesNew,
             tweet.numFavoritesOriginal, tweet.numFavoritesNew,
             textstat.syllable_count(tweet.originalText, lang='en_US'),
             textstat.lexicon_count(tweet.originalText, removepunct=True),
             textstat.sentence_count(tweet.originalText),
             textstat.flesch_reading_ease(tweet.originalText),
             textstat.flesch_kincaid_grade(tweet.originalText),
             textstat.gunning_fog(tweet.originalText),
             textstat.smog_index(tweet.originalText),
             textstat.automated_readability_index(tweet.originalText),
             textstat.coleman_liau_index(tweet.originalText),
             textstat.linsear_write_formula(tweet.originalText),
             textstat.dale_chall_readability_score(tweet.originalText),
             textstat.difficult_words(tweet.originalText),
             textstat.text_standard(tweet.originalText, float_output=False),
             sentiment_dict_original['neg'], sentiment_dict_original['neu'],
             sentiment_dict_original['pos'], sentiment_dict_original['compound'], cSyllableCount,
             cLexiconCount, cSentenceCount, cFleschReadingEase, cFleschKincaidGrade, cGunningFog,
             cSmogIndex, cAutomatedReadabilityIndex, cColemanLiauIndex, cLinsearWriteFormula, cDaleChallReadabilityScore,
             cDifficultWords, cTextStandard, cNegSent, cNeuSent, cPosSent, cCompoundSent,
             textstat.syllable_count(tweet.originalText + commentText, lang='en_US'),
             textstat.lexicon_count(tweet.originalText + commentText, removepunct=True),
             textstat.sentence_count(tweet.originalText + commentText),
             textstat.flesch_reading_ease(tweet.originalText + commentText),
             textstat.flesch_kincaid_grade(tweet.originalText + commentText),
             textstat.gunning_fog(tweet.originalText + commentText),
             textstat.smog_index(tweet.originalText + commentText),
             textstat.automated_readability_index(tweet.originalText + commentText),
             textstat.coleman_liau_index(tweet.originalText + commentText),
             textstat.linsear_write_formula(tweet.originalText + commentText),
             textstat.dale_chall_readability_score(tweet.originalText + commentText),
             textstat.difficult_words(tweet.originalText + commentText),
             textstat.text_standard(tweet.originalText + commentText, float_output=False),
             sentiment_dict_combined['neg'], sentiment_dict_combined['neu'],
             sentiment_dict_combined['pos'], sentiment_dict_combined['compound'],
             tweet.twitterQueryUsers, tweet.twitterQueryNotUsers,
             tweet.twitterQueryHashtags, tweet.twitterQueryKeywords,
             tweet.twitterQueryFromDate, tweet.twitterQueryToDate]
        )

    return response
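
The readability battery above is computed three times (original, comment, and combined text); a sketch of factoring it into one helper (the name is mine, and syllable_count is called here without the lang argument, which newer textstat releases configure through set_lang instead):

import textstat

def readability_row(text):
    # same metrics, in the same order as the CSV columns above
    return [
        textstat.syllable_count(text),
        textstat.lexicon_count(text, removepunct=True),
        textstat.sentence_count(text),
        textstat.flesch_reading_ease(text),
        textstat.flesch_kincaid_grade(text),
        textstat.gunning_fog(text),
        textstat.smog_index(text),
        textstat.automated_readability_index(text),
        textstat.coleman_liau_index(text),
        textstat.linsear_write_formula(text),
        textstat.dale_chall_readability_score(text),
        textstat.difficult_words(text),
        textstat.text_standard(text, float_output=False),
    ]

print(readability_row("This is a short example tweet. It has two sentences."))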
Example #13
def sentence_count(cleanedtext):
    blob = TextBlob(cleanedtext)
    split_text=blob.sentences
    No_Of_Sentences=len(split_text)
    #Initially used this but split_text tends to overcount
    return textstat.sentence_count(cleanedtext)
Example #14
def process_data(contents):
    ############ Word Tokenization
    ## Raw tokens: including punctuations, numbers etc.
    tokens = word_tokenize(contents)

    ## Convert all words into small cases
    ## Keep tokens that purely consist of alphabetic characters only
    ## Delete single-character words except for 'I'
    words = [
        w.lower() for w in tokens if w.isalpha() and len(w) > 1 or w == 'i'
    ]
    print(
        f'contents length:{len(contents)},tokens:{len(tokens)},words:{len(words)}'
    )

    ########### Delete words whose length falls below the 1st or above the 99th percentile of word lengths (currently disabled)
    # wordlen99 = np.quantile([len(w) for w in words], 0.99)
    # wordlen1 = np.quantile([len(w) for w in words], 0.01)
    # words = [w for w in words if len(w)<wordlen99 and len(w)>wordlen1]
    vocab = sorted(set(words))

    ########### Save text statistics
    ##### 1. nw 2. nvocab 3. nsyllable 4.nsentence 5. tone 6. readability

    ## 1. nw
    nw.append(len(words))

    ## 2. nvocab
    nvocab.append(len(vocab))

    ## 3. syllable
    nsyllable.append(textstat.syllable_count(contents))

    ## 4. sentence
    nsentence.append(textstat.sentence_count(contents))

    ## 5. tone
    ### LM dictionary
    # n3=count_occurrence3(words, lm_neg)
    # n1=count_occurrence(words, lm_neg)
    # # n2=count_occurrence2(set(words), set(lm_neg))
    # if n1!=n3:
    #     print('EEEEEEEEEEEEEEEEEEEEEEEror')
    #     exit()
    # n_neg_lm.append(n1)

    n_neg_lm.append(count_occurrence3(words, lm_neg))
    n_pos_lm.append(count_occurrence3(words, lm_pos))
    n_uctt_lm.append(count_occurrence3(words, lm_uctt))
    n_lit_lm.append(count_occurrence3(words, lm_lit))
    #         n_cstr_lm.append(count_occurrence(words, lm_cstr))
    #         n_modal1_lm.append(count_occurrence(words, lm_modal1))
    #         n_modal2_lm.append(count_occurrence(words, lm_modal2))
    #         n_modal3_lm.append(count_occurrence(words, lm_modal3))

    # n_negation_lm.append(count_negation(words, lm_pos, gt_negation))
    # n1=count_negation(words, lm_pos, gt_negation)
    # n2=count_negation3(words, lm_pos, gt_negation)
    # if n1 != n2:
    #     print('EEEEEEEEEEEEEEEEEEEEEEEEEEor')
    #     exit()
    # n_negation_lm.append(n1)

    n_negation_lm.append(count_negation3(words, lm_pos, gt_negation))

    ### General Inquirer dictionary
    n_neg_gi.append(count_occurrence3(words, gi_neg))
    n_pos_gi.append(count_occurrence3(words, gi_pos))
    n_negation_gi.append(count_negation3(words, gi_pos, gt_negation))
    ### Henry dictionary
    n_neg_hr.append(count_occurrence3(words, hr_neg))
    n_pos_hr.append(count_occurrence3(words, hr_pos))
    n_negation_hr.append(count_negation3(words, gi_pos, gt_negation))

    ## 6. readability
    fre_i = textstat.flesch_reading_ease(contents)
    if fre_i > 100:
        fre_i = 100
    if fre_i < 0:
        fre_i = float('NaN')
    fre.append(fre_i)

    fkg_i = textstat.flesch_kincaid_grade(contents)
    if fkg_i < 0:
        fkg_i = float('NaN')
    fkg.append(fkg_i)
    #RIX
    cl_i = textstat.coleman_liau_index(contents)
    if cl_i < 0:
        cl_i = float('NaN')
    cl.append(cl_i)

    fog.append(textstat.gunning_fog(contents))
    ari.append(textstat.automated_readability_index(contents))
    smog.append(textstat.smog_index(contents))
Example #15
def test_sentence_count():
    count = textstat.sentence_count(long_test)

    assert count == 16
Example #16
def compute_corpus_statistics(window,
                              inputFilename,
                              inputDir,
                              outputDir,
                              openOutputFiles,
                              createExcelCharts,
                              excludeStopWords=True,
                              lemmatizeWords=True):
    filesToOpen = []
    outputFilenameCSV = IO_files_util.generate_output_file_name(
        inputFilename, inputDir, outputDir, '.csv', 'corpus', 'stats')
    filesToOpen.append(outputFilenameCSV)
    inputDocs = IO_files_util.getFileList(inputFilename,
                                          inputDir,
                                          fileType='.txt')

    # read_line(inputFilename, inputDir, outputDir)
    # return

    Ndocs = str(len(inputDocs))
    fieldnames = [
        'Number of documents in corpus', 'Document ID', 'Document',
        'Number of Sentences in Document', 'Number of Words in Document',
        'Number of Syllables in Document', 'Word1', 'Frequency1', 'Word2',
        'Frequency2', 'Word3', 'Frequency3', 'Word4', 'Frequency4', 'Word5',
        'Frequency5', 'Word6', 'Frequency6', 'Word7', 'Frequency7', 'Word8',
        'Frequency8', 'Word9', 'Frequency9', 'Word10', 'Frequency10', 'Word11',
        'Frequency11', 'Word12', 'Frequency12', 'Word13', 'Frequency13',
        'Word14', 'Frequency14', 'Word15', 'Frequency15', 'Word16',
        'Frequency16', 'Word17', 'Frequency17', 'Word18', 'Frequency18',
        'Word19', 'Frequency19', 'Word20', 'Frequency20'
    ]
    if IO_csv_util.openCSVOutputFile(outputFilenameCSV):
        return

    IO_user_interface_util.timed_alert(GUI_util.window, 2000, 'Analysis start',
                                       'Started running corpus statistics at',
                                       True)

    with open(outputFilenameCSV,
              'w',
              encoding='utf-8',
              errors='ignore',
              newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        #print("Number of corpus text documents: ",Ndocs)
        #currentLine.append([Ndocs])
        index = 0
        for doc in inputDocs:
            head, tail = os.path.split(doc)
            index = index + 1
            # currentLine.append([index])
            print("Processing file " + str(index) + "/" + str(Ndocs) + " " +
                  tail)
            #currentLine.append([doc])
            with open(doc, "r", encoding="utf-8", errors="ignore") as docfile:
                fullText = docfile.read()

            Nsentences = str(textstat.sentence_count(fullText))
            #print('TOTAL number of sentences: ',Nsentences)

            Nwords = str(textstat.lexicon_count(fullText, removepunct=True))
            #print('TOTAL number of words: ',Nwords)

            Nsyllables = textstat.syllable_count(fullText, lang='en_US')
            #print('TOTAL number of Syllables: ',Nsyllables)

            # words = fullText.split()
            words = nltk.word_tokenize(fullText)

            if excludeStopWords:
                words = excludeStopWords_list(words)

            if lemmatizeWords:
                lemmatizer = WordNetLemmatizer()
                text_vocab = set(
                    lemmatizer.lemmatize(w.lower())
                    for w in fullText.split(" ") if w.isalpha())
                words = set(
                    lemmatizing(w.lower()) for w in words
                    if w.isalpha())  # fullText.split(" ") if w.isalpha())

            word_counts = Counter(words)
            # 20 most frequent words
            #print("\n\nTOP 20 most frequent words  ----------------------------")
            # for item in word_counts.most_common(20):
            #     print(item)
            # currentLine=[[Ndocs,index,doc,Nsentences,Nwords,Nsyllables]]
            currentLine = [[
                Ndocs, index,
                IO_csv_util.dressFilenameForCSVHyperlink(doc), Nsentences,
                Nwords, Nsyllables
            ]]
            for item in word_counts.most_common(20):
                currentLine[0].append(item[0])  # word
                currentLine[0].append(item[1])  # frequency
            writer = csv.writer(csvfile)
            writer.writerows(currentLine)
        csvfile.close()

        # compute statistics about doc length grouped by Document
        groupBy_fields = ['Document ID']
        tempOutputfile = statistics_csv_util.compute_field_statistics_groupBy(
            window, outputFilenameCSV, outputDir, groupBy_fields, openOutputFiles,
            createExcelCharts, 4)
        # ,4)  # 'number of words in doc'
        if tempOutputfile is not None:
            filesToOpen.extend(tempOutputfile)

        IO_user_interface_util.timed_alert(
            GUI_util.window, 2000, 'Analysis end',
            'Finished running corpus statistics at', True)

        if createExcelCharts:
            columns_to_be_plotted = [[1, 3], [1, 4]]
            hover_label = ['Document', 'Document']
            inputFilename = outputFilenameCSV
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted,
                inputFilename,
                outputDir,
                outputFileLabel='corpus_stats',
                chart_type_list=["bar"],
                # chart_title='Corpus statistics\nCorpus directory: '+inputDir,
                chart_title=
                'Corpus Statistics: Frequency of Sentences & Words by Document',
                column_xAxis_label_var='Document',
                hover_info_column_list=hover_label)
            if Excel_outputFilename != "":
                filesToOpen.append(Excel_outputFilename)

        # TODO
        #   we should create 10 classes of values by distance to the median of
        #       each value in the Number of Words in Document Col. E
        #   -0-10 11-20 21-30,… 91-100
        #   and plot them as column charts.

        if openOutputFiles:
            IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                          filesToOpen)
    return filesToOpen
Example #17
def avg_sentence_length(text): 
    words = word_count(text) 
    sentences = textstat.sentence_count(text)
    average_sentence_length = float(words / sentences) 
    return average_sentence_length
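
A defensive variant sketch that guards against a zero sentence count (the helper name is mine, and textstat.lexicon_count stands in for the word_count helper used above):

import textstat

def safe_avg_sentence_length(text):
    words = textstat.lexicon_count(text, removepunct=True)
    sentences = textstat.sentence_count(text)
    return words / sentences if sentences else 0.0

print(safe_avg_sentence_length("This is one sentence. Here is another, slightly longer one."))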
Example #18
    instance_ids = []
    ttr = []
    mtld = []
    number_of_words = []
    readability = []
    unique_words = []
    avg_sentence_length = []
    avg_word_length = []
    for i, instance in enumerate(text_data):
        # lexical richness instance for entry before lemmatization and stopword removal
        lex_with_stopwords = LexicalRichness(instance)
        number_of_words.append(lex_with_stopwords.words)

        # mean sentence length
        mean_sentence_len = int(lex_with_stopwords.words /
                                textstat.sentence_count(instance))
        avg_sentence_length.append(mean_sentence_len)

        # mean word length
        num_chars = sum([len(w) for w in tokenizer.tokenize(instance)])
        mean_word_len = round(num_chars / lex_with_stopwords.words, 1)
        avg_word_length.append(mean_word_len)

        # readability
        readability.append(textstat.flesch_reading_ease(instance))

        # remove stopwords & lemmatize
        lemmatizer = WordNetLemmatizer()
        instance_no_stopwords = remove_stopwords(instance)
        new_instance = ' '.join([
            lemmatizer.lemmatize(w)
def compute_syntactic(df, col):
    for ind, row in df.iterrows():
        df.loc[ind, 'syllable_count'] = textstat.syllable_count(str(row[col]))
        #df.loc[ind,'lexicon_count'] = textstat.lexicon_count(str(row[col]), removepunct=True)
        df.loc[ind, 'sentence_count'] = textstat.sentence_count(str(row[col]))
    return df
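
A toy usage sketch for compute_syntactic, plus a vectorized alternative with apply (the column name and data are made up):

import pandas as pd
import textstat

df = pd.DataFrame({"text": ["One sentence here.", "Two sentences. Really, there are two."]})
out = compute_syntactic(df.copy(), "text")
print(out[["syllable_count", "sentence_count"]])

# the same per-row counts without iterrows, usually faster on larger frames
df["sentence_count"] = df["text"].astype(str).apply(textstat.sentence_count)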
Example #20
def get_redability_assessments(data_text: str) -> Optional[dict]:
    divided_text = tokenize.sent_tokenize(data_text)
    word_tokenizes = nltk.word_tokenize(data_text)
    pos_tags = nltk.pos_tag(word_tokenizes)
    pos_tags_tagger = TAGGER.tag(word_tokenizes)
    f_dist = nltk.FreqDist(word_tokenizes)

    uniqueWordCount = compute_unique_word_count(f_dist.most_common())

    paragraphCount = max(len(data_text.split('\n')), len(data_text.split('\r\n')))

    counts = Counter(tag for word, tag in pos_tags)

    # Readability Grade Levels
    readability_grade_levels = dict(fleschKincaid=0, gunningFog=0, colemanLiau=0, smog=0,
                                    ari=0, forecastGradeLevel=0, powersSumnerKearlGrade=0, rix=0,
                                    raygorReadability=0, fryReadability=0, flesch=0)

    readability_grade_levels.update(fleschKincaid=textstat.flesch_kincaid_grade(data_text))
    readability_grade_levels.update(gunningFog=textstat.gunning_fog(data_text))
    readability_grade_levels.update(colemanLiau=textstat.coleman_liau_index(data_text))
    readability_grade_levels.update(smog=textstat.smog_index(data_text))
    readability_grade_levels.update(ari=textstat.automated_readability_index(data_text))
    readability_grade_levels.update(rix=textstat.rix(data_text))

    # need to check
    readability_grade_levels.update(forecastGradeLevel=round(20 - (textstat.avg_syllables_per_word(data_text) / 10), 2))

    readability_grade_levels.update(powersSumnerKearlGrade=round(textstat.avg_sentence_length(data_text) +
                                                                 textstat.avg_syllables_per_word(data_text) +
                                                                 2.7971, 2))
    readability_grade_levels.update(raygorReadability=count_raygor_readability(divided_text))
    readability_grade_levels.update(fryReadability=count_fry_readability(divided_text))
    # need to check

    readability_grade_levels.update(flesch=textstat.flesch_reading_ease(data_text))

    # Readability Scores
    readability_scores = dict(readableRating="", fleschReadingEase=0, cefrLevel='', ieltsLevel='', spacheScore=0,
                              newDaleChallScore=0, lixReadability=0, lensearWrite=0)
    readability_scores.update(readableRating=count_average_grade_levels(readability_grade_levels))
    readability_scores.update(fleschReadingEase=textstat.flesch_reading_ease(data_text))
    readability_scores.update(cefrLevel=count_cefr_levels(readability_grade_levels))
    readability_scores.update(ieltsLevel=count_ielts_levels(readability_grade_levels))
    readability_scores.update(spacheScore=round(textstat.spache_readability(data_text), 2))
    readability_scores.update(newDaleChallScore=textstat.dale_chall_readability_score_v2(data_text))
    readability_scores.update(lixReadability=textstat.lix(data_text))
    readability_scores.update(lensearWrite=textstat.linsear_write_formula(data_text))

    # Text Statistics
    text_statistics = dict(characterCount=0, syllableCount=0, wordCount=0, uniqueWordCount=0,
                           sentenceCount=0, paragraphCount=0)
    text_statistics.update(characterCount=textstat.char_count(data_text))
    text_statistics.update(syllableCount=textstat.syllable_count(data_text))
    text_statistics.update(wordCount=textstat.lexicon_count(data_text))
    text_statistics.update(uniqueWordCount=uniqueWordCount)
    text_statistics.update(sentenceCount=textstat.sentence_count(data_text))
    text_statistics.update(paragraphCount=paragraphCount)

    # Timings
    timings_statistics = dict(readingTime=0, speakingTime=0)
    timings_statistics.update(readingTime=reading_time(textstat.lexicon_count(data_text)))
    timings_statistics.update(speakingTime=speaking_time(textstat.lexicon_count(data_text)))

    # Text Composition
    text_composition = dict(adjectives=0, adverbs=0, conjunctions=0, determiners=0, interjections=0, nouns=0, verbs=0,
                            properNouns=0, prepositions=0, pronouns=0, qualifiers=0, unrecognised=0, nonWords=0)

    text_composition.update(adjectives=counts.get('JJ', 0) + counts.get('JJR', 0) + counts.get('JJS', 0))
    text_composition.update(adverbs=counts.get('RB', 0) + counts.get('RBR', 0) + counts.get('RBS', 0))
    text_composition.update(conjunctions=counts.get('CC', 0))
    text_composition.update(determiners=counts.get('DT', 0) + counts.get('PDT', 0) + counts.get('WDT', 0))
    text_composition.update(interjections=counts.get('UH', 0))
    text_composition.update(nouns=counts.get('NN', 0) + counts.get('NNS', 0))
    text_composition.update(
        verbs=counts.get('VB', 0) + counts.get('VBD', 0) + counts.get('VBG', 0) + counts.get('VBN', 0) + counts.get(
            'VBP', 0) + counts.get('VBZ', 0))
    text_composition.update(properNouns=counts.get('NNP', 0) + counts.get('NNPS', 0))
    text_composition.update(prepositions=counts.get('IN', 0))
    text_composition.update(
        pronouns=counts.get('PRP', 0) + counts.get('PRP$', 0) + counts.get('WP', 0) + counts.get('WP$', 0))
    text_composition.update(qualifiers=counts.get('RB', 0))
    text_composition.update(unrecognised=counts.get(None, 0))
    text_composition.update(nonWords=counts.get('.', 0) + counts.get(',', 0) + counts.get(':', 0))

    # Readability Issues
    text_readability_issues = dict(sentences30SyllablesCount=0, sentences20SyllablesCount=0,
                                   sentences30Syllables=[], sentences20Syllables=[],
                                   words4SyllablesCount=0, words12LettersCount=0,
                                   words4Syllables=[], words12Letters=[])

    sentences_30_syllables, sentences_30_count, sentences_20_syllables, sentences_20_count = count_sentences_syllables(
        divided_text)

    sentences_30_syllables = find_limit_offcet(data_text, sentences_30_syllables,
                                               "sentences_30_syllables",
                                               "sentences_30_syllables",
                                               "This sentence has more than 30 syllables. Consider rewriting it to be shorter or splitting it into smaller sentences.",
                                               "Readability Issues")
    sentences_20_syllables = find_limit_offcet(data_text, sentences_20_syllables,
                                               "sentences_20_syllables",
                                               "sentences_20_syllables",
                                               "This sentence has more than 20 syllables. Consider rewriting it to be shorter or splitting it into smaller sentences.",
                                               "Readability Issues")

    text_readability_issues.update(sentences30SyllablesCount=sentences_30_count,
                                   sentences20SyllablesCount=sentences_20_count)

    words_12_letters, words_12_count, words_4_syllables, words_4_count = words_sentence_syllables(divided_text)

    words_12_letters = find_limit_offcet(data_text, words_12_letters,
                                         "words_12_letters",
                                         "words_12_letters",
                                         "This word is more than 12 letters",
                                         "Readability Issues")
    words_4_syllables = find_limit_offcet(data_text, words_4_syllables,
                                          "words_4_syllables",
                                          "words_4_syllables",
                                          "This word is more than 4 syllables",
                                          "Readability Issues")

    text_readability_issues.update(words4SyllablesCount=words_4_count,
                                   words12LettersCount=words_12_count)

    # Writing Style Issues
    text_style_issues = dict(passiveVoiceCount=0, passiveVoices=[],
                             adverbsCount=0, adverbs=[],
                             clicheCount=0, cliches=[])
    passive_voises_return = find_passives(divided_text)
    passive_voises_return = find_limit_offcet(data_text, passive_voises_return,
                                              "passive_voises",
                                              "passive_voises",
                                              "Too much use of the passive voice",
                                              "Writing Style Issues")
    adverbs_return = find_adverbs(pos_tags_tagger)
    adverbs_return = find_limit_offcet(data_text, adverbs_return,
                                       "adverbs",  # writing_style_issues
                                       "adverbs",
                                       "Too many adverbs",
                                       "Writing Style Issues")
    text_style_issues.update(passiveVoiceCount=len(passive_voises_return),
                             adverbsCount=len(adverbs_return))

    # Text Density Issues
    text_density_issues = dict(charactersPerWord=0, syllablesPerWord=0, wordsPerSentence=0,
                               wordsPerParagraph=0, sentencesPerParagraph=0)

    text_density_issues.update(charactersPerWord=textstat.avg_character_per_word(data_text),
                               syllablesPerWord=textstat.avg_syllables_per_word(data_text),
                               wordsPerSentence=round(textstat.lexicon_count(data_text) / len(divided_text), 2),
                               wordsPerParagraph=round(textstat.lexicon_count(data_text) / paragraphCount, 2),
                               sentencesPerParagraph=round(len(divided_text) / paragraphCount, 2))

    # Language Issues
    text_language_issues = dict(spellingIssuesCount=0, grammarIssueCount=0)

    matches_limit_offcet = sentences_20_syllables + sentences_30_syllables + words_4_syllables + words_12_letters + \
                           passive_voises_return + adverbs_return

    return dict(readabilityGradeLevels=readability_grade_levels,
                readabilityScores=readability_scores,
                textStatistics=text_statistics,
                timings=timings_statistics,
                textComposition=text_composition,
                textReadabilityIssues=text_readability_issues,
                textStyleIssues=text_style_issues,
                textDensityIssues=text_density_issues,
                textLanguageIssues=text_language_issues,
                matches=matches_limit_offcet)
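
get_redability_assessments depends on several project helpers (TAGGER, count_raygor_readability, find_limit_offcet, and so on), so it is not runnable on its own; below is a minimal sketch of just the POS-composition step it uses, assuming the required NLTK tokenizer and POS tagger data are downloaded:

from collections import Counter
import nltk

sample = "The quick brown fox jumps over the lazy dog."
counts = Counter(tag for _, tag in nltk.pos_tag(nltk.word_tokenize(sample)))
nouns = counts.get("NN", 0) + counts.get("NNS", 0)
adjectives = counts.get("JJ", 0) + counts.get("JJR", 0) + counts.get("JJS", 0)
print(nouns, adjectives, counts)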
Example #21



pol = lambda x: TextBlob(x).sentiment.polarity
sub = lambda x: TextBlob(x).sentiment.subjectivity


df_clean_train['subjectivity'] = df_clean_train['Sentence'].apply(sub)
df_clean_train['polar'] = df_clean_train['Sentence'].apply(pol)
df_clean_train['Polarity'] = df_train['Polarity']
df_clean_train['review_len'] = df_clean_train['Sentence'].astype(str).apply(len)
df_clean_train['word_count'] = df_clean_train['Sentence'].apply(lambda x: len(str(x).split()))
df_clean_train['syllable_count'] = df_clean_train['Sentence'].apply(lambda x: textstat.syllable_count(x))
df_clean_train['lexicon_count'] = df_clean_train['Sentence'].apply(lambda x: textstat.lexicon_count(x))
df_clean_train['sentence_count'] = df_clean_train['Sentence'].apply(lambda x: textstat.sentence_count(x))
df_clean_train['flesch_reading_ease'] = df_clean_train['Sentence'].apply(lambda x: textstat.flesch_reading_ease(x))
df_clean_train['flesch_kincaid_grade'] = df_clean_train['Sentence'].apply(lambda x: textstat.flesch_kincaid_grade(x))
df_clean_train['gunning_fog'] = df_clean_train['Sentence'].apply(lambda x: textstat.gunning_fog(x))

df_clean_train.head()

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(min_df=.02, max_df=.5, ngram_range=[1,3], max_features=500, stop_words='english')
dtm_tfidf = tfidf_vectorizer.fit_transform(df_clean_train['Sentence'])

bow_df_tfidf = pd.DataFrame(dtm_tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names(), index=df_clean_train.index)
bow_df_tfidf.shape

df_bow_tfidf = pd.concat([df_clean_train, bow_df_tfidf], axis=1)
Example #22
def main(dir: str):
    checker = language_tool_python.LanguageTool('en-US')
    emails = {}
    totalWords = ''

    filenames = [
        filename for filename in os.listdir(dir) if filename.endswith('.eml')
    ]
    for filename in filenames:
        print()
        print('[INFO] Processing {}...'.format(filename))

        with open(os.path.join(dir, filename), 'r', encoding='latin1') as file:
            try:
                mail = mailparser.parse_from_file_obj(file)
            except Exception as e:
                print('[WARNING] Error while parsing: {}'.format(e))
                continue
            # filter duplicates based on subject
            #if mail.subject in emails:
            #    print('[WARNING] This email seems to be a duplicate of "{}"! Skipping...'
            #        .format(emails[mail.subject]['filename']))
            #    continue

            # don't process if auth results missing
            # if 'Authentication-Results' not in mail.headers:
            #     print('[WARNING] This email is missing an authentication results header! Skipping...')
            #     continue

            attachments = ''
            for attachment in mail.attachments:
                attachment['filename'] = re.sub(r'<|>', '',
                                                attachment['filename'])
            try:
                mail.write_attachments(dir)
                for attachment in mail.attachments:
                    if re.search('image', attachment['mail_content_type']):
                        if re.search('gif', attachment['mail_content_type']):
                            images, _, _ = gif2numpy.convert(
                                dir + '\\' + attachment['filename'])
                            img = images[0]
                        else:
                            img = cv2.imread(dir + '\\' +
                                             attachment['filename'])
                        img = cv2.resize(img,
                                         None,
                                         fx=1.2,
                                         fy=1.2,
                                         interpolation=cv2.INTER_CUBIC)
                        text = pytesseract.image_to_string(img)
                        attachments += text
                    elif re.search('pdf', attachment['mail_content_type']):
                        encoding = chardet.detect(
                            pdf_to_text(dir + '\\' +
                                        attachment['filename']))['encoding']
                        attachments += pdf_to_text(
                            dir + '\\' +
                            attachment['filename']).decode(encoding)
                    # elif re.search('text', attachment['mail_content_type']):
                    #     #print(chardet.detect((attachment['payload']).encode()))
                    #     #encoding = chardet.detect(base64.b64decode(attachment['payload']).encode())['encoding']
                    #     #attachments += base64.b64decode(attachment['payload']).decode(encoding)
                    #     #print(codecs.encode(base64.b64decode(attachment['payload']), encoding=attachment['content_transfer_encoding']))
                    #     attachments += attachment['payload']
                    else:
                        attachments += attachment['payload']
                    os.remove(dir + '\\' + attachment['filename'])
            except Exception as e:
                print(
                    '[WARNING] Error while parsing attachments: {}'.format(e))
                [
                    os.remove(dir + '\\' + attachment['filename'])
                    for attachment in mail.attachments
                ]

            body = mail.subject + ' ' + \
                   remove_noise(BeautifulSoup(mail.body, 'lxml').get_text(separator=' ', strip=True) +
                                BeautifulSoup(attachments, 'lxml').get_text())
            blob = TextBlob(body)
            totalWords = totalWords + " " + body.lower()
            grammarErrors = checker.check(body)

            if 'Authentication-Results' in mail.headers:
                spf = re.findall(r'spf=(\S*)',
                                 mail.headers['Authentication-Results'])
                dkim = re.findall(r'dkim=(\S*)',
                                  mail.headers['Authentication-Results'])
                dmarc = re.findall(r'dmarc=(\S*)',
                                   mail.headers['Authentication-Results'])
            else:
                spf = dkim = dmarc = ''

            emails[filename] = {
                'filename': filename,
                # 'hops': mail.received[-1]['hop'],
                # 'totalDelay': sum([hop['delay']/60 for hop in mail.received]),
                'spf': spf[0] if len(spf) else None,
                'dkim': dkim[0] if len(dkim) else None,
                'dmarc': dmarc[0] if len(dmarc) else None,
                'subject': mail.subject,
                'from': mail.from_[0][1],
                'to': [tup[1] for tup in mail.to],
                'replyTo': [tup[1] for tup in mail.reply_to],
                'attachments': [x['filename'] for x in mail.attachments],
                'grammarErrors': len(grammarErrors),
                'counts': {
                    'characterCount': len(body),
                    'wordCount': textstat.lexicon_count(body),
                    'sentenceCount': textstat.sentence_count(body)
                },
                'readability': {
                    'flesch_kincaid':
                    textstat.flesch_kincaid_grade(body),
                    'gunning_fog':
                    textstat.gunning_fog(body),
                    'smog_index':
                    textstat.smog_index(body),
                    'automated_readability_index':
                    textstat.automated_readability_index(body),
                    'coleman_liau_index':
                    textstat.coleman_liau_index(body),
                    'linsear_write':
                    textstat.linsear_write_formula(body),
                },
                'sentiment': {
                    'polarity': blob.sentiment.polarity,
                    'subjectivity': blob.sentiment.subjectivity
                }
            }

            if save_body:
                emails[filename]['body'] = body

    ## quit if nothing found ##
    # if not emails:
    #     print('[WARNING] No files were found in "{}"!'.format(dir))
    #     return

    ## writing all words to file ##
    with open(os.path.join(dir, 'words.txt'), 'w', encoding='utf-8') as file:
        file.write(totalWords.lower())

    ## output json ##
    with open(os.path.join(dir, 'analysis.json'), 'w') as jsonFile:
        json.dump(emails, jsonFile, indent=2)

    ## build and output csv ##

    # generate and output headers using first email
    column_headers = list(flatten_json(emails[list(emails.keys())[0]]).keys())
    csvFile = open(os.path.join(dir, 'analysis.csv'), 'w', encoding='utf-8')
    csvFile.write(',{}\n'.format(','.join(column_headers)))

    # generate and output one line per email
    for email in emails.keys():
        # flatten json to 1 layer deep
        flattened_email = flatten_json(emails[email])
        # generate the values for this row
        csv_values = [
            '"' + str(flattened_email[column_header]) + '"'
            for column_header in column_headers
        ]
        # add email name and join w/ commas, then write out
        csvFile.write('{},{}\n'.format('"' + email + '"',
                                       ','.join(csv_values)))

    csvFile.close()

    # print out stats
    print('{}/{} processed. The remaining failed for some reason.'.format(
        len(emails), len(filenames)))
def sentence_count(text):
	return textstat.sentence_count(text)
Example #24
signal.signal(signal.SIGINT, handler)

d = "/home/adulau/dess/12/01"
ld = os.listdir(d)
stats = {}
stats['hits'] = 0
stats['miss'] = 0

for f in ld:
    currentfile = os.path.join(d, f)
    with gzip.open(currentfile) as paste:
        content = paste.read().decode('utf-8')
        lexicon = textstat.lexicon_count(content, removepunct=True)
        syllabe = textstat.syllable_count(content, lang='en_US')
        sentence = textstat.sentence_count(content)
        # consensus = textstat.text_standard(content, float_output=False)
        # print ("sentence={}, syllabe={}, lexicon={}, flesch_reading_score={},{}".format(sentence, syllabe, lexicon, textstat.flesch_reading_ease(content), currentfile))
        analysis = {}
        analysis['sentence'] = sentence
        analysis['syllabe'] = syllabe
        analysis['lexicon'] = lexicon
        analysis['flesch_reading_ease'] = textstat.flesch_reading_ease(content)
        analysis['filename'] = currentfile
        analysis['length'] = len(content)
        analysis['extract'] = content[:100]

        #rank = (analysis['flesch_reading_ease']+analysis['flesch_reading_ease']+analysis['lexicon'])*analysis['sentence']
        rank = analysis['flesch_reading_ease']
        if analysis['flesch_reading_ease'] >= 0 and analysis[
                'flesch_reading_ease'] <= 900:
Example #25
    def test_sentence_count(self):
        count = textstat.sentence_count(self.long_test)

        self.assertEqual(16, count)
Example #26
###############################################################################
# Readability scores: Greta Thunberg

import textstat
import numpy as np

# drop empty text fields
temp = greta.copy()
temp['text'].replace('', np.nan, inplace=True)
temp['text'].replace(' ', np.nan, inplace=True)
temp.dropna(subset=['text'], inplace=True)

temp['syl_count'] = temp.text.apply(lambda x: textstat.syllable_count(x))
temp['word_count'] = temp.text.apply(
    lambda x: textstat.lexicon_count(x, removepunct=True))
temp['sent_count'] = temp.text.apply(lambda x: textstat.sentence_count(x))
temp['score_fre'] = temp.text.apply(lambda x: textstat.flesch_reading_ease(x))
temp['score_are'] = temp.text.apply(
    lambda x: textstat.automated_readability_index(x))
temp['char_count'] = temp.text.apply(lambda x: len(x))

sns.distplot(temp.word_count,
             hist=True,
             kde=False,
             norm_hist=True,
             color='darkblue',
             hist_kws={'edgecolor': 'black'})

fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(nrows=2, ncols=2, figsize=(8, 6))
fig.subplots_adjust(hspace=.5)
sns.despine()
Example #27
File: test.py  Project: shivam5992/textstat
def test_sentence_count():
    count = textstat.sentence_count(long_test)

    assert count == 16
Example #28
def test_sentence_count():
    textstat.set_lang("en_US")
    count = textstat.sentence_count(long_test)

    assert count == 16
Example #29
    ]
    for token in final_tokens:
        synset = wsd.lesk(sentence, token)
        if synset is not None:
            SynSets.append(synset)
            synsDic[synset] = token

SynSets = set(SynSets)

SynSets = sorted(SynSets)
with open("synsets.txt", "a") as file:
    file.write("\n---------------------\n")
    for synset in SynSets:
        file.write("{} -> {}\n".format(synset, synsDic[synset]))
sentencesCount = textstat.sentence_count(text)
print("Sentences: ", sentencesCount)
prepro = preprocess(text)

# # for i in prepro:
# #     print(i,"\n")

print("Tokens inc:", len(prepro))
pattern = 'NP: {<DT>?<JJ>*<NN>}'

cp = nltk.RegexpParser(pattern)
cs = cp.parse(prepro)
print("Tokens ex:", len(cs))
iob_tagged = tree2conlltags(cs)
ne_tree = ne_chunk(pos_tag(word_tokenize(text)))
nlp = en_core_web_sm.load()
def sentence_count(corpus):
    return np.array([textstat.sentence_count(doc)
                     for doc in corpus]).reshape(-1, 1)
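
The reshaped (n_samples, 1) array plugs straight into scikit-learn pipelines; a hedged sketch wrapping the function above in a FunctionTransformer (the wrapping is my addition, and the snippet's numpy and textstat imports are assumed to be in scope):

from sklearn.preprocessing import FunctionTransformer

sentence_feature = FunctionTransformer(sentence_count, validate=False)
X = sentence_feature.transform(["One sentence.", "Two sentences. Yes, really two."])
print(X.shape)  # (2, 1)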
Example #31
#    print(downloaded)
    if url != 0 and downloaded:
        from readability.readability import Document
        from html2text import html2text

        readable_article = Document(downloaded).summary()

        raw = html2text(readable_article)
        print(raw)

        # Lexicon Count - number of words present in the text
        lexicon_count = textstat.lexicon_count(raw, removepunct=True)
        worksheet.update("C" + row, lexicon_count)

        # Sentence Count
        sentence_count = textstat.sentence_count(raw)
        worksheet.update("D" + row, sentence_count)

        # The Flesch Reading Ease formula
        # 90-100 - Very Easy | 80-89 - Easy | 70-79 - Fairly Easy | 60-69 - Standard
        # 50-59 - Fairly Difficult | 30-49 - Difficult | 0-29 - Very Confusing
        flesch_reading_ease = textstat.flesch_reading_ease(raw)
        worksheet.update("E" + row, flesch_reading_ease)

        # Flesch-Kincaid Grade Level
        # https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch%E2%80%93Kincaid_grade_level
        flesch_kincaid_grade = textstat.flesch_kincaid_grade(raw)
        worksheet.update("F" + row, flesch_kincaid_grade)

        # The Fog Scale (Gunning FOG Formula)
        # https://en.wikipedia.org/wiki/Gunning_fog_index