Example #1
def get_readability_scores(self, doc):
    segment = doc.text
    readability_dict = {
        "automated_readability_index": textstat.automated_readability_index(segment),
        "coleman_liau_index": textstat.coleman_liau_index(segment),
        "dale_chall_readability_score": textstat.dale_chall_readability_score(segment),
        "difficult_words": textstat.difficult_words(segment),
        "flesch_kincaid_grade": textstat.flesch_kincaid_grade(segment),
        "flesch_reading_ease": textstat.flesch_reading_ease(segment),
        "gunning_fog": textstat.gunning_fog(segment),
        "linsear_write_formula": textstat.linsear_write_formula(segment),
        "smog_index": textstat.smog_index(segment),
        "text_standard": self._convert_text_standard_to_integer(
            textstat.text_standard(segment)),
    }
    return readability_dict
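The helper _convert_text_standard_to_integer is not shown in this snippet. Since textstat.text_standard returns a label such as "8th and 9th grade", one plausible sketch of the helper (an assumption, not the original code) is:

import re

def _convert_text_standard_to_integer(self, standard):
    # Pull the first grade number out of a label like "8th and 9th grade".
    match = re.search(r'-?\d+', standard)
    return int(match.group()) if match else 0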
Example #2
def preprocess_text(text):
    """Takes a text, generates features, and returns them as a dict.

    Args:
        text (str): the text to be preprocessed.

    Returns:
        dict: a dictionary of feature names with associated values

    """
    text = _simplify_punctuation(text)

    features = {
        "flesch_reading_ease": textstat.flesch_reading_ease(text),
        "smog_index": textstat.smog_index(text),
        "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
        "coleman_liau_index": textstat.coleman_liau_index(text),
        "automated_readability_index":
        textstat.automated_readability_index(text),
        "dale_chall_readability_score":
        textstat.dale_chall_readability_score(text),
        "difficult_words": textstat.difficult_words(text),
        "linsear_write_formula": textstat.linsear_write_formula(text),
        "gunning_fog": textstat.gunning_fog(text),
        "text_standard": textstat.text_standard(text, float_output=True),
        "mean_parse_tree_depth": get_mean_parse_tree_depth(text),
        "mean_ents_per_sentence": get_mean_ents_per_sentence(text),
    }

    features.update(get_mean_pos_tags(text))

    return features
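The helpers _simplify_punctuation, get_mean_parse_tree_depth, get_mean_ents_per_sentence, and get_mean_pos_tags are project-specific and not shown. A minimal sketch of what _simplify_punctuation might do (an assumption, not the original helper) is to normalize typographic punctuation so textstat sees plain ASCII:

def _simplify_punctuation(text):
    # Collapse curly quotes, long dashes, and ellipses into ASCII equivalents.
    replacements = {'“': '"', '”': '"', '‘': "'", '’': "'",
                    '–': '-', '—': '-', '…': '...'}
    for old, new in replacements.items():
        text = text.replace(old, new)
    return text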
Example #3
def encode(self, sentence: str) -> np.ndarray:
    # Fall back to an empty string for non-string input.
    if not isinstance(sentence, str):
        sentence = ''
    return np.array([
        textstat.flesch_reading_ease(sentence),
        textstat.syllable_count(sentence),
        textstat.text_standard(sentence, float_output=True),
        textstat.syllable_count(sentence),  # syllable_count is repeated, so the vector carries it twice
        textstat.lexicon_count(sentence, removepunct=True)
    ])
Example #4
def get_score(text):
    """Return textstat metrics normalized against precomputed corpus statistics."""
    scores = []
    scores.append((tst.avg_sentence_length(text) - MEAN_SL) / STD_SL)
    scores.append((tst.avg_letter_per_word(text) - MEAN_AL) / STD_AL)
    scores.append(tst.avg_sentence_per_word(text))
    scores.append((tst.sentence_count(text) - MEAN_SC) / STD_SC)
    scores.append((tst.flesch_kincaid_grade(text) - MEAN_GRADE) / MEAN_GRADE)
    scores.append((tst.flesch_reading_ease(text) - 50) / 50)
    scores.append((tst.smog_index(text) - MEAN_GRADE) / MEAN_GRADE)
    scores.append((tst.coleman_liau_index(text) - MEAN_GRADE) / MEAN_GRADE)
    scores.append((tst.automated_readability_index(text) - MEAN_GRADE) / MEAN_GRADE)
    scores.append((tst.dale_chall_readability_score(text) - MEAN_GRADE) / MEAN_GRADE)
    scores.append((tst.linsear_write_formula(text) - MEAN_GRADE) / MEAN_GRADE)
    scores.append((tst.gunning_fog(text) - MEAN_GRADE) / MEAN_GRADE)
    return scores
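The MEAN_*/STD_* names are module-level constants that do not appear in the snippet. A usage harness with purely illustrative placeholder values might look like this:

from textstat import textstat as tst

# Placeholder normalization constants; the real values are not in the snippet.
MEAN_SL, STD_SL = 15.0, 5.0    # average sentence length
MEAN_AL, STD_AL = 4.5, 0.5     # average letters per word
MEAN_SC, STD_SC = 10.0, 5.0    # sentence count
MEAN_GRADE = 8.0               # shared scale for the grade-level metrics

print(get_score("The quick brown fox jumps over the lazy dog."))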
Example #5
def post(self, args):
    text = args['text']
    readability = {
        "flesch_reading_ease": textstat.flesch_reading_ease(text),
        "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
        "smog_index": textstat.smog_index(text),
        "coleman_liau_index": textstat.coleman_liau_index(text),
        "automated_readability_index": textstat.automated_readability_index(text),
        "dale_chall_readability_score": textstat.dale_chall_readability_score(text),
        "linsear_write_formula": textstat.linsear_write_formula(text),
        "gunning_fog": textstat.gunning_fog(text),
        "text_standard": textstat.text_standard(text),
        "difficult_words": textstat.difficult_words(text),
    }
    return jsonify(readability)
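The handler reads like a Flask-RESTful or webargs style resource method. One hypothetical way to wire it up with plain Flask (everything except the textstat call is an assumption):

from flask import Flask, jsonify, request
from textstat import textstat

app = Flask(__name__)

@app.route('/readability', methods=['POST'])
def readability_endpoint():
    # Expects a JSON body of the form {"text": "..."}.
    args = request.get_json(force=True)
    text = args['text']
    return jsonify({"flesch_reading_ease": textstat.flesch_reading_ease(text)})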
Example #6
def inputNumber(message):
    while True:
        try:
            userInput = int(input(message))
        except ValueError:
            print("Invalid input. Please enter a number: 1, 2, 3, or 4.")
            continue
        if userInput not in [1, 2, 3, 4]:
            print("Invalid integer. Please enter 1, 2, 3, or 4.")
            continue
##############################################################################################################
#######--------CHOICE-#1:-DOCUMENT-FILE----------------------------------------------------------##############
##############################################################################################################
        if userInput == 1:
            docchoice = input("Please enter the name of the Text File.\n")
            with open(docchoice, 'r') as sourcedoc:
                readsource = sourcedoc.read()
            lowfile = readsource.lower()
            #            filesoup = BeautifulSoup(lowfile,'lxml')
            #            filetext = filesoup.get_text(strip = True)
            #            sent = TextBlob(filetext)
            sent = TextBlob(lowfile)
            slashsplice = sent.replace('/', ' ')
            dashsplice = (slashsplice.replace('-', ' '))
            dashsplice2 = (dashsplice.replace('–', ' '))
            sentblob = TextBlob(lowfile)
            filepunct = TextBlob(str(remove_punctuation(dashsplice2)))
            finaltext = str(remove_punctuation(dashsplice2))
            print("\n-----------------------------------------------")
            print("-----Sentiment Analysis Guide------------------")
            print("-----------------------------------------------")
            print(
                "    Polarity(Emotion): \n    [ -1:Negative,   0:Neutral,   1:Positive ]"
            )
            print(
                "\n    Subjectivity(Fact VS Opinion): \n    [ 0:Objective    1:Subjective ]"
            )
            print("------------------------------------------------")
            polar = sentblob.sentiment.polarity
            subject = sentblob.sentiment.subjectivity
            print("\n|------------------------------------|")
            print("|-----SENTIMENT ANALYSIS RESULTS-----|")
            print("|------------------------------------|")
            print("|    Polarity: ", polar,
                  "                \n|    Subjectivity: ", subject,
                  "            ")
            print("|------------------------------------|")
            tag_dict = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
            words_and_tags = [(w, tag_dict.get(pos[0], 'n'))
                              for w, pos in filepunct.tags]
            lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
            punctuate = str.maketrans('', '', string.punctuation)
            tokens = [w.translate(punctuate) for w in lemmatized_list]
            #            splitpunct = filepunct.split()
            stoplist = stopwords.words('english') + [
                'ie', 'may', 'us', 'shall', 'etc', 'thereof', '2', '1', '0',
                '–', '’', '’', '“', '”'
            ]
            #            tokens = [w for w in splitpunct]
            clean_tokens = tokens[:]
            for token in tokens:
                if token in stoplist:
                    clean_tokens.remove(token)
            count = Counter(clean_tokens)
            print("\n-------30 MOST COMMON WORDS-------: \n")
            for key, value in count.most_common(30):
                print("   " + str(value) + " - " + key)
            print("\n-------FREQUENCY CHART-------:")
            freq = nltk.FreqDist(clean_tokens)
            freq.plot(15, cumulative=False)
            ##---------------PHRASE (1,2,3,4 WORDS) COUNTER----------------------------------------
            bitokens = nltk.word_tokenize(finaltext)
            bgs = nltk.ngrams(bitokens, 2)
            fdist = nltk.FreqDist(bgs)
            count = fdist.most_common(10)
            tgs = nltk.ngrams(bitokens, 3)
            fdist2 = nltk.FreqDist(tgs)
            count2 = fdist2.most_common(10)
            qgs = nltk.ngrams(bitokens, 4)
            fdist3 = nltk.FreqDist(qgs)
            count3 = fdist3.most_common(10)
            print("\n--------COMMON PHRASES (2 WORDS)--------:\n")
            for (key, key2), value in count:
                print("   ", key, "", key2, "", "-", value)
            print("\n--------COMMON PHRASES (3 WORDS)--------:\n")
            for (key, key2, key3), value in count2:
                print("   ", key, "", key2, "", key3, "-", value)
            print("\n--------COMMON PHRASES (4 WORDS)--------:\n")
            for (key, key2, key3, key4), value in count3:
                print("   ", key, "", key2, "", key3, "", key4, "-", value)
####---------------------READABILITY INDEX--------------------###########
            flesch_score = int(textstat.flesch_reading_ease(readsource))
            print("--------FLESCH READING EASE TEST--------\n",
                  "\n    Readability Score: ", flesch_score)
            if flesch_score < 30:
                print(
                    "    Very difficult to read. Best understood by university graduates."
                )
            elif flesch_score < 50:
                print("    Difficult to read.")
            elif flesch_score < 60:
                print("    Fairly difficult to read.")
            elif flesch_score < 70:
                print(
                    "    Plain English. Easily understood by 13- to 15-year-old students."
                )
            elif flesch_score < 80:
                print("    Fairly easy to read.")
            elif flesch_score < 90:
                print("    Easy to read.")
            else:
                print(
                    "    Very easy to read. Easily understood by an average 11-year-old student."
                )
            print("-----------------------------------\n")

            ##################---END. LOOP---##########################################################################################################
            again = input(
                "\nThank you for using BTL 0.6. Run Again? [Y / N]\n")
            acceptable = ["Y", "y", "N", "n"]
            if again in ["Y", "y"]:
                print("What kind of document?")
                return inputNumber(message)
            if again in ["N", "n"]:
                quit()
            if again not in acceptable:
                print(
                    "\nSorry, didn't catch that. Please select an option below:"
                )
                return inputNumber(message)
            break

##############################################################################################################
####----------CHOICE-#2:-URL/LINK-------------------------------------------------------------------------------
##############################################################################################################
        if userInput == 2:
            webchoice = input("Please enter the URL of the website.\n")
            webdoc = urllib.request.urlopen(webchoice)
            readweb = webdoc.read()
            websoup = w3lib.html.remove_tags(readweb)
            #            websoup = BeautifulSoup(readweb,'html5lib')
            #  websoup2 = websoup.text
            print(websoup)
            lowweb = websoup.lower()
            websent = TextBlob(lowweb)
            slashsplice = websent.replace('/', ' ')
            dashsplice = (slashsplice.replace('-', ' '))
            dashsplice2 = (dashsplice.replace('–', ' '))
            dashsplice3 = (dashsplice2.replace(' – ', ' '))
            pagesplice = dashsplice3.replace(' p. ', ' ')
            pagesplice2 = pagesplice.replace(' pp.', ' ')
            webpunct = TextBlob(str(remove_punctuation(pagesplice2)))
            finalweb = str(remove_punctuation(pagesplice2))
            print("\n-----------------------------------------------")
            print("-----Sentiment Analysis Guide------------------")
            print("-----------------------------------------------")
            print(
                "    Polarity(Emotion): \n    [ -1:Negative,   0:Neutral,   1:Positive ]"
            )
            print(
                "\n    Subjectivity(Fact VS Opinion): \n    [ 0:Objective    1:Subjective ]"
            )
            print("------------------------------------------------")
            polar = websent.sentiment.polarity
            subject = websent.sentiment.subjectivity
            print("\n|------------------------------------|")
            print("|-----SENTIMENT ANALYSIS RESULTS-----|")
            print("|------------------------------------|")
            print("|    Polarity: ", polar,
                  "                \n|    Subjectivity: ", subject,
                  "            ")
            print("|------------------------------------|")
            tag_dict = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
            words_and_tags = [(w, tag_dict.get(pos[0], 'n'))
                              for w, pos in webpunct.tags]
            lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
            punctuate = str.maketrans('', '', string.punctuation)
            tokens = [w.translate(punctuate) for w in lemmatized_list]
            stoplist = stopwords.words('english') + [
                'ie', 'may', 'us', 'shall', 'etc', 'thereof', " ",
                'mwparseroutput', 'wwww3org', 'xmlnshttp', 'also', '1', '0',
                'svg', '2', 'jw', '’', '“', '”', 'u'
            ]
            clean_tokens = tokens[:]
            for token in tokens:
                if token in stoplist:
                    clean_tokens.remove(token)
            count = Counter(clean_tokens)
            print("\n---------MOST COMMON WORDS---------: \n")
            for key, value in count.most_common(30):
                print("   " + key + " - " + str(value))
            print("\n---------FREQUENCY CHART---------:")
            freq = nltk.FreqDist(clean_tokens)
            freq.plot(10, cumulative=False)
            #################################################################################################
            ##---------------PHRASE (1,2,3,4) COUNTER----------------------------------------
            ###################################################################################
            bitokens = nltk.word_tokenize(finalweb)
            bgs = nltk.ngrams(bitokens, 2)
            fdist = nltk.FreqDist(bgs)
            count = fdist.most_common(20)
            tgs = nltk.ngrams(bitokens, 3)
            fdist2 = nltk.FreqDist(tgs)
            count2 = fdist2.most_common(20)
            qgs = nltk.ngrams(bitokens, 4)
            fdist3 = nltk.FreqDist(qgs)
            count3 = fdist3.most_common(20)
            print("\n--------COMMON PHRASES (2 WORDS)--------:\n")
            for (key, key2), value in count:
                print("   ", key, "", key2, "", "-", value)
            print("\n--------COMMON PHRASES (3 WORDS)--------:\n")
            for (key, key2, key3), value in count2:
                print("   ", key, "", key2, "", key3, "-", value)
            print("\n--------COMMON PHRASES (4 WORDS)--------:\n")
            for (key, key2, key3, key4), value in count3:
                print("   ", key, "", key2, "", key3, "", key4, "-", value)
    #################################################################################################
    ##---------------READABILITY INDEX----------------------------------------
    ###################################################################################
    ##########---------------END LOOP---------------------##############################
            again = input("\nThank you for using BTL 0.6. Run Again? [Y / N]")
            acceptable = ["Y", "y", "N", "n"]
            if again in ["Y", "y"]:
                print("What kind of document?")
                return inputNumber(message)
            if again in ["N", "n"]:
                print("Bye!")
                quit()
            if again not in acceptable:
                print(
                    "\nSorry, didn't catch that. Please select an option below:"
                )
                return inputNumber(message)
            break

########################################################################################################################
############--------CHOICE-#3:-MANUAL-INPUT----------########################################
############################################################################################################

        if userInput == 3:
            manchoice = input("Please enter your text here:\n")
            lowman = manchoice.lower()
            mansoup = BeautifulSoup(lowman, 'html5lib')
            mantext = mansoup.get_text(strip=True)
            mansent = TextBlob(mantext)
            sent = TextBlob(manchoice)
            manpunct = TextBlob(str(remove_punctuation(mansent)))
            finalman = str(remove_punctuation(mansent))
            splitpunct = manpunct.split()
            stoplist = stopwords.words('english') + [
                'ie', 'may', 'us', 'shall', 'etc', 'thereof', '0', '–', '’',
                '“', '”', '’'
            ]
            print("\n-----------------------------------------------")
            print("-----Sentiment Analysis Guide------------------")
            print("-----------------------------------------------")
            print(
                "    Polarity(Emotion): \n    [ -1:Negative,   0:Neutral,   1:Positive ]"
            )
            print(
                "\n    Subjectivity(Fact VS Opinion): \n    [ 0:Objective    1:Subjective ]"
            )
            print("------------------------------------------------")
            polar = sent.sentiment.polarity
            subject = sent.sentiment.subjectivity
            print("\n|------------------------------------|")
            print("|-----SENTIMENT ANALYSIS RESULTS-----|")
            print("|------------------------------------|")
            print("|    Polarity: ", polar,
                  "                \n|    Subjectivity: ", subject,
                  "            ")
            print("|------------------------------------|")
            tag_dict = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
            words_and_tags = [(w, tag_dict.get(pos[0], 'n'))
                              for w, pos in manpunct.tags]
            lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
            punctuate = str.maketrans('', '', string.punctuation)
            #            tokens = [w.translate(punctuate) for w in lemmatized_list]
            tokens = [w for w in splitpunct]
            stoplist = stopwords.words('english') + [
                'ie', 'may', 'us', 'shall', 'etc', 'thereof', '—'
            ]
            clean_tokens = tokens[:]
            for token in tokens:
                if token in stoplist:
                    clean_tokens.remove(token)
            count = Counter(clean_tokens)
            print("\n------35 MOST COMMON WORDS------: \n")
            for key, value in count.most_common(35):
                print("   " + key + " - " + str(value))
            print("\n------FREQUENCY CHART------:")
            freq = nltk.FreqDist(clean_tokens)
            freq.plot(10, cumulative=False)
            #################################################################################################
            ##---------------PHRASE (1,2,3,4 WORDS) COUNTER----------------------------------------
            ##################################################################################
            bitokens = nltk.word_tokenize(finalman)
            bgs = nltk.ngrams(bitokens, 2)
            fdist = nltk.FreqDist(bgs)
            count = fdist.most_common(10)
            tgs = nltk.ngrams(bitokens, 3)
            fdist2 = nltk.FreqDist(tgs)
            count2 = fdist2.most_common(10)
            qgs = nltk.ngrams(bitokens, 4)
            fdist3 = nltk.FreqDist(qgs)
            count3 = fdist3.most_common(10)
            print("\n--------COMMON PHRASES (2 WORDS)--------:\n")
            for (key, key2), value in count:
                print("   ", key, "", key2, "", "-", value)
            print("\n--------COMMON PHRASES (3 WORDS)--------:\n")
            for (key, key2, key3), value in count2:
                print("   ", key, "", key2, "", key3, "-", value)
            print("\n--------COMMON PHRASES (4 WORDS)--------:\n")
            for (key, key2, key3, key4), value in count3:
                print("   ", key, "", key2, "", key3, "", key4, "-", value)
    ######---------------READABILITY INDEX#----------------####
            flesch_score = int(textstat.flesch_reading_ease(manchoice))
            print("\n----------FLESCH READING EASE TEST----------:\n",
                  "\n    Readability Score: ", flesch_score, "\n")
            if flesch_score < 30:
                print(
                    "    --Very difficult to read. Best understood by university graduates.--"
                )
            elif flesch_score < 50:
                print("    --Difficult to read.--")
            elif flesch_score < 60:
                print("    --Fairly difficult to read.--")
            elif flesch_score < 70:
                print(
                    "    --Plain English. Easily understood by 13 to 15-year-old students.--"
                )
            elif flesch_score < 80:
                print("    --Fairly easy to read.--")
            elif flesch_score < 90:
                print("    --Easy to read.--")
            else:
                print(
                    "    --Very easy to read. Easily understood by an average 11-year-old student.--"
                )
            print("\n------------------------------------------\n")

            again = input("\nThank you for using BTL 0.6. Run Again? [Y / N]")
            acceptable = ["Y", "y", "N", "n"]
            if again in ["Y", "y"]:
                print("What kind of document?")
                return inputNumber(message)
            if again in ["N", "n"]:
                print("Bye!")
                quit()
            if again not in acceptable:
                print(
                    "\nSorry, didn't catch that. Please select an option below:"
                )
                return inputNumber(message)
            break
###################################################################################################################
##########---------CHOICE 4: QUIT PROGRAM-------------------------------------------------------------------------------
######################################################################################################################
        if userInput == 4:
            print("Thank you for using BTL 0.5. Bye!")
            quit()
            break
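The script calls remove_punctuation throughout but never defines it; a minimal sketch of such a helper (an assumption, not the original code):

import string

def remove_punctuation(text):
    # Accepts a str or TextBlob-like object and strips ASCII punctuation.
    return str(text).translate(str.maketrans('', '', string.punctuation))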
Example #7
def get_score(self):
    # Treat line breaks as sentence boundaries before scoring.
    self.input_data = self.input_data.replace("\n", ". ")
    return textstat.flesch_reading_ease(self.input_data)
Example #8
from textstat import textstat

# File used to test the readability functions.
if __name__ == '__main__':
    test_data = 'The quick brown fox jumps over the lazy dog'

    print(textstat.flesch_reading_ease(test_data))
    print(textstat.smog_index(test_data))
    print(textstat.flesch_kincaid_grade(test_data))
    print(textstat.coleman_liau_index(test_data))
    print(textstat.automated_readability_index(test_data))
    print(textstat.dale_chall_readability_score(test_data))
    print(textstat.difficult_words(test_data))
    print(textstat.linsear_write_formula(test_data))
    print(textstat.gunning_fog(test_data))
    print(textstat.text_standard(test_data))
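A more compact alternative (not part of the original example) is to loop over the metric functions; each bound method carries a __name__, so the output labels itself:

from textstat import textstat

test_data = 'The quick brown fox jumps over the lazy dog'
metrics = [
    textstat.flesch_reading_ease, textstat.smog_index,
    textstat.flesch_kincaid_grade, textstat.coleman_liau_index,
    textstat.automated_readability_index,
    textstat.dale_chall_readability_score, textstat.difficult_words,
    textstat.linsear_write_formula, textstat.gunning_fog,
    textstat.text_standard,
]
for metric in metrics:
    print(metric.__name__, metric(test_data))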
Example #9
def index():
    data = request.json
    print(f'Debug: {data}')

    unique_id = data['unique_id']
    process_language = data['process_language']
    message = data['message']

    matches_list = None
    if process_language:
        # Language tool takes a while to process
        language_tool = LanguageTool('en-US')
        matches: list[Match] = language_tool.check(message)

        matches_list = []
        for match in matches:
            matches_list.append(match_to_dict(match))
        print(f'Analysis finished: {matches_list}')

    sentences: list = splitter.split(text=message)

    return {
        'unique_id': unique_id,
        'text_statistics': {
            'lexicon_count': textstat.lexicon_count(message),
            'lexicon_count_ps': list_map(sentences, textstat.lexicon_count),
            'syllable_count': textstat.syllable_count(message),
            'syllable_count_ps': list_map(sentences, textstat.syllable_count),
            'sentences': sentences,
            'sentence_count': len(sentences),
            'readability': {
                'flesch_reading_ease': {
                    'score': textstat.flesch_reading_ease(message),
                    'sps': list_map(sentences, textstat.flesch_reading_ease)
                },
                'smog_index': {
                    'score': textstat.smog_index(message)
                },
                'flesch_kincaid_grade': {
                    'score': textstat.flesch_kincaid_grade(message),
                    'sps': list_map(sentences, textstat.flesch_kincaid_grade)
                },
                'coleman_liau_index': {
                    'score': textstat.coleman_liau_index(message),
                    'sps': list_map(sentences, textstat.coleman_liau_index)
                },
                'automated_readability_index': {
                    'score': textstat.automated_readability_index(message),
                    'sps': list_map(sentences,
                                    textstat.automated_readability_index)
                },
                'dale_chall_readability_score': {
                    'score': textstat.dale_chall_readability_score(message),
                    'sps': list_map(sentences,
                                    textstat.dale_chall_readability_score)
                },
                'difficult_words': {
                    'score': textstat.difficult_words(message),
                    'sps': list_map(sentences, textstat.difficult_words),
                    'words': textstat.difficult_words_list(message)
                },
                'linsear_write_formula': {
                    'score': round(textstat.linsear_write_formula(message), 2),
                    'sps': list_map(sentences, textstat.linsear_write_formula)
                },
                'gunning_fog': {
                    'score': textstat.gunning_fog(message),
                    'sps': list_map(sentences, textstat.gunning_fog)
                },
                'text_standard': {
                    'score': textstat.text_standard(message)
                }
            }
        },
        'language_tool': matches_list
    }
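list_map and splitter are helpers assumed by this endpoint. splitter.split(text=...) matches the sentence_splitter package's API, but both definitions below are guesses, not the original code:

from sentence_splitter import SentenceSplitter

splitter = SentenceSplitter(language='en')

def list_map(items, func):
    # Apply func to each item and return a plain, JSON-serializable list.
    return [func(item) for item in items]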
Example #10
#%%
from textstat import textstat
import json

#%%
# with open('../frontend/assets/questions.json') as f:
#     questions = json.load(f)
# for passage in questions['passages']:
#     passage_text = ' '.join(passage['passage'])
#     print('Passage: ', passage_text)
#     print(textstat.flesch_reading_ease(passage_text))

passage_text = """
When men of great intellect, who have long and intently and exclusively given themselves to the study or investigation of some one particular branch of secular knowledge, whose mental life is concentrated and hidden in their chosen pursuit, and who have neither eyes nor ears for any thing which does not immediately bear upon it, when such men are at length made to realize that there is a clamour all around them, which must be heard, for what they have been so little accustomed to place in the category of knowledge as Religion, and that they themselves are accused of disaffection to it, they are impatient at the interruption; they call the demand tyrannical, and the requisitionists bigots or fanatics. They are tempted to say, that their only wish is to be let alone; for themselves, they are not dreaming of offending any one, or interfering with any one; they are pursuing their own particular line, they have never spoken a word against any one's religion, whoever he may be, and never mean to do so. It does not follow that they deny the existence of a God, because they are not found talking of it, when the topic would be utterly irrelevant. {44} All they say is, that there are other beings in the world besides the Supreme Being; their business is with them. After all, the creation is not the Creator, nor things secular religious. Theology and human science are two things, not one, and have their respective provinces, contiguous it may be and cognate to each other, but not identical. When we are contemplating earth, we are not contemplating heaven; and when we are contemplating heaven, we are not contemplating earth. Separate subjects should be treated separately. As division of labour, so division of thought is the only means of successful application. "Let us go our own way," they say, "and you go yours. We do not pretend to lecture on Theology, and you have no claim to pronounce upon Science."
"""
print(textstat.flesch_reading_ease(passage_text))
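To turn the raw score into the usual difficulty band, a small helper could be added (an addition, not part of the example; the cutoffs follow the standard Flesch scale):

def flesch_band(score):
    for cutoff, label in [(30, "very difficult"), (50, "difficult"),
                          (60, "fairly difficult"), (70, "plain English"),
                          (80, "fairly easy"), (90, "easy")]:
        if score < cutoff:
            return label
    return "very easy"

print(flesch_band(textstat.flesch_reading_ease(passage_text)))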
Example #11
def _getReadability(self, text):
    # print("Reading scores: ", textstat.flesch_reading_ease(text))
    return textstat.flesch_reading_ease(text)
Example #12
def extract_features(dataframe,
                     use_token=True,
                     use_sentence=True,
                     use_word_embeddings=True,
                     use_readability_measures=False):
    """

    Parameters
    ----------
    use_sentence
    use_readability_measures
    dataframe
    use_token
    use_word_embeddings

    Returns
    -------

    """
    dataframe["subcorpus"] = \
        ENCODER.fit_transform(dataframe["subcorpus"])

    if use_sentence:
        # dataframe['sentence_length'] = dataframe['sentence'].str.len()
        dataframe["sentence_word_count"] = dataframe["sentence"].str.split(
        ).str.len()
        # dataframe["sentence_avg_word_length"] = round(
        #     dataframe["sentence_length"] / dataframe[
        #         "sentence_word_count"]).astype(int)
        # dataframe["sentence_vowel_count"] = dataframe[
        #     "sentence"].str.lower().str.count(r'[aeiou]')

    if use_readability_measures:
        # dataframe["sentence_gunning_fog"] = \
        #     dataframe.apply(lambda row:
        #                     textstat.gunning_fog(row['sentence']),
        #                     axis=1)
        dataframe["sentence_flesch_reading_ease"] = \
            dataframe.apply(lambda row:
                            textstat.flesch_reading_ease(row['sentence']),
                            axis=1)
        # dataframe["sentence_dale_chall"] = \
        #     dataframe.apply(lambda row:
        #                     textstat.dale_chall_readability_score(
        #                         row['sentence']), axis=1)
        # dataframe["sentence_syllable_count"] = \
        #     dataframe.apply(lambda row:
        #                     textstat.syllable_count(row['sentence']),
        #                     axis=1)

    if use_token:
        # dataframe["token_vowel_count"] = [
        #     textstat.syllable_count(item) for item
        #     in dataframe['sentence'].to_list()
        # ]
        dataframe['token_wordnet_senses'] = \
            dataframe.apply(lambda row:
                            Meaning.count_wordnet_senses(row['token']), axis=1)
        # dataframe['token_proper_noun'] =\
        #     dataframe.apply(lambda row: Meaning.is_proper_name(row["token"]),
        #                     axis=1)

        dataframe['token_pos_tag'] = \
            dataframe.apply(lambda row: Meaning.get_pos_tag(row["token"]),
                            axis=1)
        dataframe["token_pos_tag"] = \
            ENCODER.fit_transform(dataframe["token_pos_tag"])

        dataframe['upper'] = dataframe['token'].apply(
            lambda word: 1 if word.isupper() else 0)
        dataframe['upper_first'] = dataframe['token'].apply(
            lambda word: 1 if word[0].isupper() else 0)

        dataframe["token_vowel_count"] = dataframe["token"].str.lower(
        ).str.count(r'[aeiou]')

        # dataframe["token_freq"] = [
        #    freq_overall_corpus(item) for item in dataframe['token']
        # ]
        dataframe["token_freq_bible"] = [
            COUNTS.get_logarithmic_count(item, "bible")
            for item in dataframe['token']
        ]
        dataframe["token_freq_europarl"] = [
            COUNTS.get_logarithmic_count(item, "europarl")
            for item in dataframe['token']
        ]

    if use_word_embeddings:
        embedder_bible = FastTextEmbedder(model_name='ft_bible.bin')
        embedder_eu = FastTextEmbedder(model_name='ft_europarl.bin')
        embedder_pubmed = FastTextEmbedder(model_name='ft_pubmed.bin')

        dataframe["ft_embedding_bible"] = dataframe['sentence'].\
            apply(embedder_bible.get_mean_vector)
        dataframe["ft_embedding_europarl"] = dataframe['sentence'].\
            apply(embedder_eu.get_mean_vector)
        dataframe["ft_embedding_pubmed"] = dataframe['sentence'].\
            apply(embedder_pubmed.get_mean_vector)

        dataframe["ft_token_embedding_bible"] = dataframe["token"].\
            apply(embedder_bible.get_mean_vector)
        dataframe["ft_token_embedding_europarl"] = dataframe["token"].\
            apply(embedder_eu.get_mean_vector)
        dataframe["ft_token_embedding_pubmed"] = dataframe["token"].\
            apply(embedder_pubmed.get_mean_vector)

    return dataframe
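A hypothetical usage sketch, assuming pandas plus the module-level ENCODER (e.g. a sklearn LabelEncoder) are available; token and embedding features are switched off because they depend on external resources (COUNTS, Meaning, and the fastText models):

import pandas as pd

df = pd.DataFrame({
    "subcorpus": ["bible", "europarl"],
    "sentence": ["In the beginning was the Word.",
                 "The sitting resumed at nine in the morning."],
    "token": ["Word", "sitting"],
})
features = extract_features(df,
                            use_token=False,
                            use_word_embeddings=False,
                            use_readability_measures=True)
print(features[["sentence_word_count", "sentence_flesch_reading_ease"]])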