Example #1
def extract_readability_scores(policy_folder):
    read = Readability()
    # attach the readability component to the module-level spaCy pipeline `nlp`
    nlp.add_pipe(read, last=True)
    print("Policy, Grade, Ease")
    for filename in os.listdir(policy_folder):
        if filename[0] != ".":  # skip hidden files such as .DS_Store
            with open(os.path.join(policy_folder, filename), 'r') as myfile:
                data = myfile.read()
                doc = nlp(data)
                print("{0}, {1}, {2}".format(filename,doc._.flesch_kincaid_grade_level,doc._.flesch_kincaid_reading_ease))
Example #2
    def __init__(self, data, spacy_model='en_core_web_lg', feature_array=None):
        """
        Instantiates SimilarityVectorizer, loads spaCy model
        """
        print("Initializing spaCy...")
        self.spacy_model = spacy_model
        self.nlp = spacy.load(spacy_model)
        self.read = Readability()
        self.nlp.add_pipe(self.read, last=True)
        if isinstance(data, dict):
            self.data_dict = data
            self.data = None
        else:
            self.data = data
            self.data_dict = self.data.data_dict
        self.feature_array = feature_array
Example #3
def compute_readability(nlp, sentences):
    read = Readability()
    nlp.add_pipe(read, last=True)

    scores = []
    if len(sentences) == 0:
        return 0.0
    # Token.set_extension('context', default=False, force=True)
    for s in sentences:
        sent = nlp(s)
        # the original summed three metrics; automated_readability_index is
        # commented out below, but the divide-by-3 was kept
        avg_score = sent._.flesch_kincaid_grade_level + sent._.coleman_liau_index
        # sent._.automated_readability_index + \
        if not avg_score:
            scores.append(0)
        else:
            scores.append(avg_score / 3)
    return np.mean(scores)
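A hedged usage sketch for compute_readability, assuming numpy, spaCy, and spacy_readability are installed; the sentences are illustrative.

import numpy as np
import spacy
from spacy_readability import Readability

nlp = spacy.load('en_core_web_sm')
sentences = ["The cat sat on the mat.", "Readability scores vary with sentence complexity."]
print(compute_readability(nlp, sentences))  # mean of the per-sentence averages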
Example #4
    def __init__(self, config, max_workers=None, verbose=False):
        self.config = config
        self.available_languages = {
            'en': 'en_core_web_md',
            'de': 'de_core_news_sm',
            'fr': 'fr_core_news_sm',
            'es': 'es_core_news_sm',
            'it': 'it_core_news_sm',
            'multi': 'xx_ent_wiki_sm'
        }
        # we could also use BERT distance, but it is slower and does not support multiple languages
        # self.distance = BERT_distance()
        print("Preloading Word Embeddings for selected languages...")
        # list of the languages we want to support
        dim = 200
        vs = 200000
        self.languages = config.languages

        # Checking for available languages
        for lang in self.languages:
            if lang not in self.available_languages:
                raise Exception(
                    "Sorry, language '{}' not yet supported".format(lang))

        self.verbose = verbose
        self.max_workers = max_workers
        self.transition = transitions_handler(self.config.transition_data_path)

        self.model_summarizer = {
            l: ModelSummarizer(config, lang=l, verbose=self.verbose)
            for l in self.languages
        }
        self.embedder = {l: SisterEmbedder(lang=l) for l in self.languages}
        self.nlp = {
            l: spacy.load(self.available_languages[l])
            for l in self.languages
        }
        # Add Readability to nlp pipe
        for lang in self.nlp:
            read = Readability()
            nlp = self.get_nlp(lang)
            nlp.add_pipe(read, last=True)
Example #5
    def _calculate_readability(self, doc: Doc):
        """
        Call the readability score functions
        """
        assert doc.has_extension(STAGE.READABILITY)
        readability = Readability()
        scores = {"summary": {}, "text": {}}
        scores["text"]["dale_chall"] = readability.dale_chall(doc)
        scores["text"]["smog"] = readability.smog(doc)
        if self.summary_doc:
            scores["summary"]["dale_chall"] = readability.dale_chall(
                self.summary_doc)
            scores["summary"]["smog"] = readability.smog(self.summary_doc)

        return scores
Example #6
import gzip
import json
import pandas as pd
import spacy
#import neuralcoref
import pytextrank
import networkx as nx
from spacy_readability import Readability
import os

nlp = spacy.load('en_core_web_md')
#neuralcoref.add_to_pipe(nlp)

tr = pytextrank.TextRank()
nlp.add_pipe(tr.PipelineComponent, name="textrank")

read = Readability()
nlp.add_pipe(read, last=True)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)


def get_df(path):
    i = 0
    df = {}
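A hedged sketch of how a document run through the pipeline configured above (en_core_web_md plus pytextrank and Readability) exposes both keyphrases and readability scores; the sample text is made up.

doc = nlp("Natural language processing pipelines can score the readability of product reviews. "
          "Short, simple sentences are usually easier to read.")
print([p.text for p in doc._.phrases[:3]])  # top keyphrases from pytextrank
print(doc._.flesch_kincaid_reading_ease)    # readability score from spacy_readability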
Example #7
#pip install spacy

#spacy.load("en_core_web_sm")
#pip install spacy-readability
#pip install spacy_readability


import pandas as pd
from collections import Counter
import spacy
from spacy_readability import Readability
nlp = spacy.load('en_core_web_sm')
read = Readability(nlp)


y = Counter(([token.pos_ for token in nlp('The cat sat on the mat.')]))

y['NOUN']/len([token.pos_ for token in nlp('The cat sat on the mat.')])



dfall = pd.DataFrame()    
df2 = pd.DataFrame()
place = 0
for i in df['body']:
    i = str(i)
    y = Counter(([token.pos_ for token in nlp(i)]))
    noun = y['NOUN']/len([token.pos_ for token in nlp(i)])
Example #8
    def __init__(self,
                 model='en_core_web_sm',
                 sources_csv=None,
                 wikifier_output_dir='',
                 max_length=3000000):
        """Initialize the preprocessor."""

        # Save wikifier option
        self.wikifier_output_dir = wikifier_output_dir

        # Load the language model
        # print('Preparing language model...')
        self.nlp = spacy.load(model)
        self.nlp.max_length = max_length

        # Import readability
        # print('Testing readability...')
        try:
            from spacy_readability import Readability
            self.collect_readability_scores = True
        except ImportError:
            msg = """The spacy-readability module is not installed on your system.
            Readability scores will be unavailable unless you `pip install spacy-readability`."""
            # print(msg)
            self.collect_readability_scores = False

        # Configure language model options
        self.add_stopwords = []
        self.remove_stopwords = []
        self.skip_entities = [
            'CARDINAL', 'DATE (except months)', 'QUANTITY', 'TIME'
        ]
        self.lemmatization_cases = {
            "humanities": [{
                ORTH: u'humanities',
                LEMMA: u'humanities',
                POS: u'NOUN',
                TAG: u'NNS'
            }]
        }

        # Configure entity categories to be skipped when merging entities
        self.options = {
            'merge_noun_chunks': False,
            'merge_subtokens': False,
            'skip_ents': self.skip_entities,
            'collect_readability_scores': self.collect_readability_scores
        }

        # Handle lemmatisation exceptions
        for k, v in self.lemmatization_cases.items():
            self.nlp.tokenizer.add_special_case(k, v)

        # Add and remove custom stop words - disabled for optimisation
        # for word in self.add_stopwords:
        #     self.nlp.vocab[word].is_stop = True
        # for word in self.remove_stopwords:
        #     self.nlp.vocab[word].is_stop = False

        self.nlp.add_pipe(self.skip_ents, after='ner')

        # Add readability to pipeline
        if self.collect_readability_scores:
            self.nlp.add_pipe(Readability())

        # Load the sources file - disabled for optimisation
        self.sources = ''
        if sources_csv:
            with open(sources_csv, 'r') as f:
                self.sources = [dict(line) for line in csv.DictReader(f)]
Example #9
class NLP():
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(Readability(), last=True)
    matcher = Matcher(nlp.vocab)
    def __init__(self, text):
        self.doc = self.nlp(text)
        self.blob = TextBlob(self.doc.text)
        self.readability = self.readability_indexes()
        self.word_tokens = self.tokenize_words(self.doc)
        
        self.sents = list(self.doc.sents)
        self.polysyllables = self.get_polysyllables(self.word_tokens[1])
        self.nominalized_words = self.get_nominalized(self.word_tokens[1])
        self.pos = self.get_pos(self.doc)
        self.prepositional_phrases = self.get_pps(self.doc)
        self.passive_phrases = self.get_passive_phrases(self.doc)
        self.get_pronouns(self.doc)
        self.get_weak_verbs(self.doc)
        self.sentence_count = len(self.sents)
        self.statistics()
        self.word_count = len(self.word_tokens[1])
        self.get_freq_dist()
        #self.lexicon_count = len(self.lexicon)
        self.get_entities()
    def readability_indexes(self):
        readability_scores = {}
        readability_scores['ari'] = self.doc._.automated_readability_index
        readability_scores['coleman_liau_index'] = self.doc._.coleman_liau_index
        readability_scores['dale_chall'] = self.doc._.dale_chall
        readability_scores['flesch_kincaid_grade'] = self.doc._.flesch_kincaid_grade_level
        readability_scores['flesch_kincaid_re'] = self.doc._.flesch_kincaid_reading_ease
        readability_scores['forcast'] = self.doc._.forcast
        readability_scores['smog'] = self.doc._.smog
        return readability_scores
    
    def tokenize_words(self, document):
        spacy_word_tokens = [t.text for t in document]
        no_punct_word_tokens = []
        for w in spacy_word_tokens:
            for p in punctuation:
                w = w.replace(p, "").replace("\n", "").replace("", '')
            no_punct_word_tokens.append(w.lower())
        no_punct_word_tokens.remove('')
        return (spacy_word_tokens, no_punct_word_tokens)
    def get_polysyllables(self, some_list):
        polysyllables = []
        for w in some_list: 
            if syllables.estimate(w) > 3: 
                polysyllables.append(w)
        return polysyllables
    # def get_polysyllables2(self, doc):
    #     phoney = BigPhoney()
    #     self.total_syllables = phoney.count_syllables(self.doc.text)
    #     self.polys = []
    #     for token in doc:
    #         if phoney.count_syllables(token.text) > 3:
    #             self.polys.append(token.text)
    #         else:
    #             pass
    def get_nominalized(self, words):
        nominalized_words = {}
        nominalized_words['-tion words'] = []

        for word in words:
            if word.endswith("tion"):
                nominalized_words['-tion words'].append(word)
            
            else:
                pass
        return nominalized_words
    def get_pos(self, nlp_doc):
        parts_of_speech = {}
        parts_of_speech['gerunds'] = []
        parts_of_speech['adjectives'] = []
        parts_of_speech['adverbs'] = []
        parts_of_speech['prepositions'] = []
        for token in nlp_doc:
            if token.tag_ == "VBG":
                parts_of_speech['gerunds'].append(token.text)
            elif token.pos_ == "ADJ":
                parts_of_speech['adjectives'].append(token.text)
            elif token.pos_ == "ADV":
                parts_of_speech['adverbs'].append(token.text)
            
            else:
                pass
        return parts_of_speech

    def get_pps(self, doc):
        #Function to get prepositions from a parsed document.
        pps = []
        for token in doc:
            if token.pos_ == 'ADP':
                pp = ' '.join([tok.orth_ for tok in token.subtree])
                pps.append(pp)
        return pps

    def get_passive_phrases(self, doc):
        self.passive_sents = []
        passive_phrases = []
        passive_rule = [
            {'DEP': 'nsubjpass'},
            {'DEP': 'aux', 'OP': '*'},
            {'DEP': 'auxpass'},
            {'TAG': 'VBN'}
        ]
        self.matcher.add('passive', None, passive_rule)
        sents = list(doc.sents)
        matches = self.matcher(doc)
        for match_id, start, end in matches:
            string_id = doc.vocab.strings[match_id]
            span = doc[start:end]
            passive_phrases.append(span.text)
        for s in self.sents:
            for p in passive_phrases:
                if p in s.text:
                    self.passive_sents.append(s.text)
        #return passive_phrases
    def get_weak_verbs(self, doc):
        self.weak_verbs = {}
        self.weak_verbs['to be'] = []
        self.weak_verbs['auxiliary'] = []
        for token in doc:
            if token.lemma_ == "be":
                self.weak_verbs['to be'].append(token.text)
            elif token.pos_ == 'AUX':
                self.weak_verbs['auxiliary'].append(token.text)
            else:
                pass
    def get_pronouns(self, doc):
        self.personal_pronouns = {}
        self.personal_pronouns['first person pronouns'] = []
        self.personal_pronouns['second person pronouns'] = []
        self.pronouns = []
        for token in doc:
            if token.tag_ == 'PRP' or token.tag_ == "PRP$":
                if token.text.lower() in ['i', 'me', 'mine', 'my', 'myself']:
                    self.personal_pronouns['first person pronouns'].append(token.text)
                elif token.text.lower() in ['you', 'your', 'yours', 'yourself']:
                    self.personal_pronouns['second person pronouns'].append(token.text)
                else:
                    pass
            elif token.pos_ == "PRON":
                    self.pronouns.append(token.text.lower())
            else:
                pass
    def statistics(self):
        self.statistics = {}
        self.statistics['per sentence'] = {} # rate per sentence
        self.statistics['per sentence'].update({'preposition rate':len(self.prepositional_phrases)/self.sentence_count})
        self.statistics['per sentence'].update({'be rate':len(self.weak_verbs['to be'])/self.sentence_count})   
        self.statistics['per sentence'].update({'passive rate':len(self.passive_sents)/self.sentence_count})
        self.statistics['percent of sentences'] = {}
        self.statistics['percent of sentences'].update({'prepositions':self.statistics['per sentence']['preposition rate'] * 100})
        self.statistics['percent of sentences'].update({'to be':self.statistics['per sentence']['be rate'] * 100})
        self.statistics['percent of sentences'].update({'passives':self.statistics['per sentence']['passive rate'] * 100})
        self.statistics['ratios'] = {}
        self.statistics['ratios'].update({'adverbs to adjectives':len(self.pos['adverbs'])/len(self.pos['adjectives'])})
    
    def get_freq_dist(self):
        words = [token.text for token in self.doc if token.is_stop != True and token.is_punct != True and token.text.isalpha() == True]
        nouns = [token.text for token in self.doc if token.is_stop != True and token.is_punct != True and token.pos_ == "NOUN" and token.text.isalpha() == True]     
        verbs = [token.text for token in self.doc if token.is_stop != True and token.is_punct != True and token.pos_ == "VERB" and token.text.isalpha() == True]
        
        word_freq = Counter(words)
        noun_freq = Counter(nouns)
        verb_freq = Counter(verbs)
        self.common_words = word_freq.most_common(10)
        self.common_nouns = noun_freq.most_common(10)
        self.common_verbs = verb_freq.most_common(10)
    def get_entities(self):
        self.entities = {}
        for ent in self.doc.ents:
            self.entities[ent.text] = ent.label_
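A hedged usage sketch for the NLP class above, assuming its module-level imports (spacy, spacy_readability, spacy.matcher.Matcher, textblob, syllables, string.punctuation, collections.Counter) are in place; the sample text is a placeholder.

text = ("The long report was written by the new committee. "
        "Nominalization and passive constructions tend to reduce readability.")
analysis = NLP(text)
print(analysis.readability['flesch_kincaid_grade'])
print(analysis.statistics['per sentence'])
print(analysis.common_nouns)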
Example #10
    def transform(self, data):
        data = data[:]
        data = pd.DataFrame({'essay_id': data.index, 'essay': data.values})

        # correct lexical and syntactic errors with language_check
        tool = language_check.LanguageTool('en-US')
        data['matches'] = data['essay'].apply(lambda v: tool.check(v))
        data['corrections_num'] = data.apply(lambda l: len(l['matches']),
                                             axis=1)
        data['corrected'] = data.apply(
            lambda l: language_check.correct(l['essay'], l['matches']), axis=1)

        # tokenize, then apply POS tagging and named entity recognition
        tokens, sents, lemma, pos, ner, stop_words = [], [], [], [], [], STOP_WORDS
        flesch_kincaid_grade_level, flesch_kincaid_reading_ease, \
        dale_chall, smog, coleman_liau_index, automated_readability_index, \
        forcast = [], [], [], [], [], [], []
        nlp = spacy.load('en_core_web_sm')
        nlp.add_pipe(Readability())
        for essay in nlp.pipe(data['corrected'], batch_size=2, n_threads=2):
            if essay.is_parsed:
                tokens.append([e.text for e in essay])
                sents.append([sent.string.strip() for sent in essay.sents])
                pos.append([e.pos_ for e in essay])
                ner.append([e.text for e in essay.ents])
                lemma.append([n.lemma_ for n in essay])
                flesch_kincaid_grade_level.append(
                    essay._.flesch_kincaid_grade_level)
                flesch_kincaid_reading_ease.append(
                    essay._.flesch_kincaid_reading_ease)
                dale_chall.append(essay._.dale_chall)
                smog.append(essay._.smog)
                coleman_liau_index.append(essay._.coleman_liau_index)
                automated_readability_index.append(
                    essay._.automated_readability_index)
                forcast.append(essay._.forcast)
            else:
                tokens.append(None)
                sents.append(None)
                pos.append(None)
                ner.append(None)
                lemma.append(None)
                flesch_kincaid_grade_level.append(None)
                flesch_kincaid_reading_ease.append(None)
                dale_chall.append(None)
                smog.append(None)
                coleman_liau_index.append(None)
                automated_readability_index.append(None)
                forcast.append(None)
        # POS tagging, named entity recognition, lemmatization, tokenization and sentence splitting
        data['tokens'], data['sents'], data['lemma'], data['pos'], data[
            'ner'] = tokens, sents, lemma, pos, ner
        # readability features
        data['flesch_kincaid_grade_level'], data['flesch_kincaid_reading_ease'], data['dale_chall'], data['smog'], data[
            'coleman_liau_index'], data['automated_readability_index'], data['forcast'] = \
            flesch_kincaid_grade_level, flesch_kincaid_reading_ease, dale_chall, smog, coleman_liau_index, automated_readability_index, forcast

        # extract various other features
        data['token_count'] = data.apply(lambda x: len(x['tokens']), axis=1)
        data['unique_token_count'] = data.apply(
            lambda x: len(set(x['tokens'])), axis=1)
        data['type_token_ratio'] = data.apply(
            lambda x: x['unique_token_count'] / x['token_count'], axis=1)
        data['sent_count'] = data.apply(lambda x: len(x['sents']), axis=1)
        data['ner_count'] = data.apply(lambda x: len(x['ner']), axis=1)
        data['comma'] = data.apply(lambda x: x['corrected'].count(','), axis=1)
        data['quotation'] = data.apply(
            lambda x: x['corrected'].count('\'') + x['corrected'].count('\"'),
            axis=1)
        data['exclamation'] = data.apply(lambda x: x['corrected'].count('!'),
                                         axis=1)

        data['organization'] = data.apply(
            lambda x: x['corrected'].count(r'@ORGANIZATION'), axis=1)
        data['caps'] = data.apply(lambda x: x['corrected'].count(r'@CAPS'),
                                  axis=1)
        data['person'] = data.apply(lambda x: x['corrected'].count(r'@PERSON'),
                                    axis=1)
        data['location'] = data.apply(
            lambda x: x['corrected'].count(r'@LOCATION'), axis=1)
        data['money'] = data.apply(lambda x: x['corrected'].count(r'@MONEY'),
                                   axis=1)
        data['time'] = data.apply(lambda x: x['corrected'].count(r'@TIME'),
                                  axis=1)
        data['date'] = data.apply(lambda x: x['corrected'].count(r'@DATE'),
                                  axis=1)
        data['percent'] = data.apply(
            lambda x: x['corrected'].count(r'@PERCENT'), axis=1)
        data['at_num'] = data.apply(lambda x: x['corrected'].count(r'@NUM'),
                                    axis=1)

        data['noun'] = data.apply(lambda x: x['pos'].count('NOUN'), axis=1)
        data['adj'] = data.apply(lambda x: x['pos'].count('ADJ'), axis=1)
        data['pron'] = data.apply(lambda x: x['pos'].count('PRON'), axis=1)
        data['verb'] = data.apply(lambda x: x['pos'].count('VERB'), axis=1)
        data['cconj'] = data.apply(lambda x: x['pos'].count('CCONJ'), axis=1)
        data['sconj'] = data.apply(lambda x: x['pos'].count('SCONJ'), axis=1)
        data['adv'] = data.apply(lambda x: x['pos'].count('ADV'), axis=1)
        data['det'] = data.apply(lambda x: x['pos'].count('DET'), axis=1)
        data['propn'] = data.apply(lambda x: x['pos'].count('PROPN'), axis=1)
        data['num'] = data.apply(lambda x: x['pos'].count('NUM'), axis=1)
        data['part'] = data.apply(lambda x: x['pos'].count('PART'), axis=1)
        data['intj'] = data.apply(lambda x: x['pos'].count('INTJ'), axis=1)
        data['aux'] = data.apply(lambda x: x['pos'].count('AUX'), axis=1)
        data['adp'] = data.apply(lambda x: x['pos'].count('ADP'), axis=1)
        data['punct'] = data.apply(lambda x: x['pos'].count('PUNCT'), axis=1)

        data['formal'] = data.apply(style_features, axis=1)

        connective_words = self._read_connective_words()
        data['cohesion'] = data.apply(lambda x: sum(
            [1 if t in connective_words else 0 for t in x['tokens']]),
                                      axis=1)

        return data
Example #11
def read():
    np.random.seed(123)
    pipeline = spacy.load("en")
    return Readability(nlp=pipeline)
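This reads like a pytest fixture with its decorator stripped; a hedged sketch of a test that consumes it, assuming pytest and the "en" shortcut model are available.

import spacy

def test_readability_component(read):
    # apply the component returned by the fixture directly to a parsed Doc
    doc = read(spacy.load("en")("The quick brown fox jumps over the lazy dog."))
    assert doc._.flesch_kincaid_grade_level is not None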
Example #12
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            merge = True
            if ent.label_ in skip:
                merge = False
            if ent.label_ == 'DATE' and re.match(months, ent.text.lower()):
                merge = True
            if merge:
                attrs = {"tag": ent.root.tag, "dep": ent.root.dep, "ent_type": ent.label}
                retokenizer.merge(ent, attrs=attrs)
    return doc
nlp.add_pipe(skip_ents, after='ner')

# Test for the spacy-readability module
if collect_readability_scores:
    nlp.add_pipe(Readability())

# Load the sources file  
with open('sources_csv', 'r') as f:
    sources = [dict(line) for line in csv.DictReader(f)]

# The Document class
class Document():
    """Model a document's features.

    Parameters:
    - manifest_dir: the path to the manifest directory
    - manifest_file: the name of the manifest file.
    - content_property: the name of the property from which to extract the content

    Returns a dataframe.
Example #13
def nlp():
    np.random.seed(123)
    pipeline = spacy.load("en")
    pipeline.add_pipe(Readability(nlp=pipeline))
    return pipeline
Example #14
def nlp():
    pipeline = spacy.load("en")
    pipeline.add_pipe(Readability())
    return pipeline
Example #15
def read():
    return Readability()
Example #16
def ProcessText(model: str, text: str):
    nlp = spacy.load(model)
    nlp.max_length = 3000000
    nlp.add_pipe(Readability(), last=True)
    doc = nlp(text)
    return doc2json(doc, model)
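A hedged call to ProcessText, assuming doc2json is defined elsewhere in the same module and that the named model is installed; the model name and text are placeholders.

result = ProcessText('en_core_web_sm', 'Readability can be computed for any passage of plain text.')
print(result)  # whatever doc2json serializes for this model and document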
Example #17
def read_ger():
    pipeline = spacy.load("de_core_news_sm")
    np.random.seed(123)
    return Readability(nlp=pipeline)