Example #1
def simplicity(string):
    tokens = nltk.word_tokenize(string)
    grades = readability.getmeasures(tokens)['readability grades']
    DC = grades['DaleChallIndex']
    GF = grades['GunningFogIndex']
    C = grades['Coleman-Liau']
    #print(f'DC: {DC}, GF: {GF}, C: {C}')
    # DaleChall, GunningFog and Cole are module-level weights defined elsewhere
    return DaleChall * DC + GunningFog * GF + Cole * C
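For reference, readability.getmeasures expects text that is already tokenized, with one sentence per line. A minimal self-contained sketch in that format; the three weight constants mirror the undefined names above and are assumptions:

import nltk
import readability

# Hypothetical weights for the linear combination in simplicity() above.
DaleChall, GunningFog, Cole = 0.4, 0.4, 0.2

text = "The quick brown fox jumps over the lazy dog. It was very quick."
# one tokenized sentence per line, tokens separated by spaces
prepared = '\n'.join(' '.join(nltk.word_tokenize(s))
                     for s in nltk.sent_tokenize(text))
grades = readability.getmeasures(prepared, lang='en')['readability grades']
print(grades['DaleChallIndex'], grades['GunningFogIndex'], grades['Coleman-Liau'])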
Example #2
    def __init__(self, tweet_attributes):

        self.num_tweets = tweet_attributes['num_tweets']
        self.num_hashtags = tweet_attributes['num_hashtags']
        self.num_mentions = tweet_attributes['num_mentions']
        self.num_urls = tweet_attributes['num_urls']
        self.num_media = tweet_attributes['num_media']
        self.num_symbols = tweet_attributes['num_symbols']
        self.num_polls = tweet_attributes['num_polls']
        self.text = tweet_attributes['text']
        self.tokens = tweet_attributes['tokens']

        self.token_lengths = [len(token) for token in self.tokens]
        self.tokens_lower = [token.lower() for token in self.tokens]
        self.pos_tags = nltk.pos_tag(self.tokens)
        self.sentences = sent_tokenize(self.text)
        self.num_emojis = emoji.core.emoji_count(self.text)
        (self.norm_neg_senti, self.norm_neu_senti, self.norm_pos_senti,
         self.norm_com_senti) = self.normalized_sentiment_values()

        try:
            self.readability_measures = readability.getmeasures(self.tokens,
                                                                lang='en')
        except Exception:
            # getmeasures raises on degenerate input (e.g. no usable words);
            # retry with a dummy token appended
            self.readability_measures = readability.getmeasures(self.tokens +
                                                                ['a'],
                                                                lang='en')

        self.make_char_wise_pass()
        self.make_pos_tag_wise_pass()
        self.make_token_wise_pass()
        self.make_sliding_window_pass()

        self.num_errors = 0
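For context, a minimal sketch of the tweet_attributes dict this constructor reads; the values are illustrative assumptions:

tweet_attributes = {
    'num_tweets': 1,
    'num_hashtags': 0,
    'num_mentions': 1,
    'num_urls': 0,
    'num_media': 0,
    'num_symbols': 0,
    'num_polls': 0,
    'text': 'Just landed in Berlin! Great trip. @friend',
    'tokens': ['Just', 'landed', 'in', 'Berlin', '!', 'Great', 'trip', '.', '@friend'],
}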
Example #3
def save_other_features(data, parse_lst_path, config, path, context=True, parse=True, multi=False):
    if multi:
        if 'complexity' in data:
            data_head, data_tail = multi_data(data[['id','corpus','sentence','token','complexity','class']])
        else:
            data_head, data_tail = multi_data(data[['id','corpus','sentence','token']])
        base, ext = os.path.splitext(path)
        path_head = base+'_head'+ext
        path_tail = base+'_tail'+ext
        omit = save_other_features(data_head, parse_lst_path, config, path_head, context=context, parse=parse)
        _    = save_other_features(data_tail, parse_lst_path, config, path_tail, context=context, parse=parse)
        multi_compute(data, path, path_head, path_tail, omit)
        return
    # based on aspect word
    data['word_len'] = data['token'].str.len().to_numpy()
    data['num_syllables'] = data['token'].apply(lambda x: syllables.estimate(str(x))).to_numpy()
    data['num_hyponyms'] = data.apply(lambda x: len(get_hyponyms(x['sentence'] if context else None, x['token'], disambiguate=config['disambiguate'] if context else False)), axis=1).to_numpy()
    data['num_hypernyms'] = data.apply(lambda x: len(get_hypernyms(x['sentence'] if context else None, x['token'], disambiguate=config['disambiguate'] if context else False)), axis=1).to_numpy()
    data['is_acronym'] = (data['token'].str.isupper()*1).to_numpy()
    # heuristic: a leading capital letter is treated as marking a proper noun
    data['is_pronoun'] = (data['token'].apply(lambda x: x[0].isupper())*1).to_numpy()
    # based on context
    omit = set()
    if context:
        corpus_dummies = pd.get_dummies(data['corpus'], prefix='corpus')
        for corpus_name in corpus_dummies:
            data[corpus_name] = corpus_dummies[corpus_name]
            omit.add(corpus_name)
        tagdict = load('help/tagsets/upenn_tagset.pickle')
        tags = [tag for tag in tagdict.keys() if tag[0] not in punctuation]
        POS = data.apply(lambda x: get_POS(x['sentence'], x['token']), axis=1)
        for tag in tags:
            data['POS_'+tag] = (POS == tag) * 1
        textstat.set_lang("en")
        # look up textstat's scoring methods by name instead of using eval()
        func_names = [name for name, _ in inspect.getmembers(textstat, predicate=inspect.ismethod)]
        for name in tqdm(func_names):
            method = getattr(textstat, name)
            if method.__name__ in ['difficult_words_list', 'set_lang', 'text_standard', 'dale_chall_readability_score_v2', 'dale_chall_readability_score', 'gunning_fog', 'spache_readability', 'avg_sentence_length', 'avg_sentence_per_word', 'sentence_count', 'difficult_words', 'is_difficult_word', 'is_easy_word', 'smog_index']:
                continue
            data[method.__name__] = data['sentence'].apply(lambda x: method(x)).to_numpy()
            omit.add(method.__name__)
        data['SMOGIndex'] = data['sentence'].apply(lambda x: readability.getmeasures(x, lang='en')['readability grades']['SMOGIndex']).to_numpy()
        data['DaleChallIndex'] = data['sentence'].apply(lambda x: readability.getmeasures(x, lang='en')['readability grades']['DaleChallIndex']).to_numpy()
        omit.add('SMOGIndex'); omit.add('DaleChallIndex')
        if parse and parse_lst_path is not None:
            with open(parse_lst_path, 'rb') as f:
                parse_lst = pkl.load(f)
            parse_tree_depths = []
            token_depths = []
            num_words_at_depths = []
            for parse_tree, token in tqdm(zip(parse_lst, data['token'])):
                parse_tree_depths.append(parse_tree.height())
                token_depths.append(token_depth(parse_tree, token))
                num_words_at_depths.append(num_words_at_depth(parse_tree, token_depths[-1]))
            data['parse_tree_depth'] = np.array(parse_tree_depths).astype(np.int64)
            omit.add('parse_tree_depth')
            data['token_depth'] = np.array(token_depths).astype(np.int64)
            data['num_words_at_depth'] = np.array(num_words_at_depths).astype(np.int64)
    data.to_csv(path, sep='\t')
    return omit
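A hedged usage sketch: the function needs at least the columns id, corpus, sentence and token, plus the disambiguate key read from config above. The paths and data are made up, and parse=False sidesteps the parse-tree inputs:

df = pd.DataFrame({
    'id': [0],
    'corpus': ['wiki'],
    'sentence': ['The aardvark is a nocturnal mammal .'],
    'token': ['aardvark'],
})
config = {'disambiguate': False}
omitted = save_other_features(df, parse_lst_path=None, config=config,
                              path='features.tsv', context=True, parse=False)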
Example #4
def analyze_readability_measures(text, include_word_types=False):
    import readability
    import traceback
    from collections import OrderedDict
    tokenized = sentence_per_line_tokenize(text)
    res = pd.Series(dtype=object)  # explicit dtype; values are mixed counts and ratios
    tokenized_sentences = [
        sent.split() for sent in tokenized.lower().split('\n')
    ]
    toks_flat = [w for sent in tokenized_sentences for w in sent]
    if len(toks_flat) == 0:
        return {}  # invalid...
    res['mtld'] = mtld(toks_flat)
    try:
        readability_measures = readability.getmeasures(tokenized)
    except Exception:
        traceback.print_exc()
    else:
        for k, v in readability_measures.pop('sentence info').items():
            res[k] = v
        num_words = res['words']
        num_sents = res['sentences']
        if include_word_types:
            for word_type, count in readability_measures.pop(
                    'sentence beginnings').items():
                res[f'word_type_sent_startswith_{word_type}'] = count / num_sents
            for typ, count in readability_measures.pop('word usage').items():
                res[f'word_type_overall_{typ}'] = count / num_words
        for k in 'wordtypes long_words complex_words'.split():
            res[k] = res[k] / num_words
        # res.update(readability_measures.pop('readability grades'))
    return res
Example #5
def generate(text, debug=False):
    """
    Generates the following readability scores for a user's tweets:

    -   Automated Readability Index (R_ARI)
    -   Coleman-Liau Index          (R_COL)
    -   Flesch Reading Ease         (R_FRE)
    -   Gunning-Fog Index           (R_GUN)
    -   Kincaid Grade Level         (R_KIN)
    -   LIX                         (R_LIX)
    -   SMOG Grade                  (R_SMOG)

    :param text: a String containing all tokenized sentences of a user, divided by newline characters (\n).
    :param debug: a Bool indicating if debugging information should be printed (default: False).
    :return: a Dict containing the feature names as keys and the calculated scores as values.
    """

    measures = readability.getmeasures(text, lang='nl', merge=True)
    if debug:
        for key, value in measures.items():
            print(key, ": ", value)

    return {
        "R_ARI": measures["ARI"],
        "R_COL": measures["Coleman-Liau"],
        "R_FRE": measures["FleschReadingEase"],
        "R_GUN": measures["GunningFogIndex"],
        "R_KIN": measures["Kincaid"],
        "R_LIX": measures["LIX"],
        "R_SMOG": measures["SMOGIndex"]
    }
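Note that merge=True flattens every category getmeasures returns into a single mapping, which is why the keys above are read without the intermediate 'readability grades' level. A small illustration with made-up Dutch sentences:

import readability

text = "Dit is een zin .\nDit is nog een zin ."
measures = readability.getmeasures(text, lang='nl', merge=True)
print(measures['ARI'], measures['LIX'])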
Example #6
def measure_file(srcpath, dstpath):
    print(srcpath)

    with open(srcpath, 'rt', encoding='utf-8') as srcfile:
        text = srcfile.read()

    for pat, repl in patterns:
        text = re.sub(pat, repl, text)

    results = readability.getmeasures(text, lang='en')
    measure = results['readability grades']['GunningFogIndex']

    with open(dstpath, 'wt', encoding='utf-8') as dstfile:
        for key in results:
            print('\t', key, sep='', file=dstfile)
            value = results[key]
            for key2 in value:
                value2 = value[key2]
                if isinstance(value2, float):
                    value2 = round(value2, 2)
                print('\t\t', f"{key2} {value2}", sep='', file=dstfile)
        dstfile.write(text)

    #os.remove(dstpath)
    return round(measure, 2)
Example #7
def get_q2_readability_score(row):
    text = row['question2']
    try:
        results = readability.getmeasures(text, lang='en')
    except Exception:
        return 0.00
    return results['readability grades']['FleschReadingEase']
Example #8
def generate_candidates(bg, rd_th, rp_th, clist, max_len=10):
    '''
    Recursively extend the n-gram `bg` with overlapping entries from `clist`.
    '''
    # `candidate` is a module-level dict (see the sketch after this example)
    candidate[bg] = {'rd': 0, 'rp': 0}
    if len(bg.split(' ')) > 1:
        candidate[bg]['rd'] = readability.getmeasures(
            bg, lang='en')['readability grades']['FleschReadingEase']
        candidate[bg]['rp'] = representativeness(bg)
    gen_list = list(clist)

    if candidate[bg]['rd'] < rd_th or candidate[bg]['rp'] < rp_th or len(
            bg.split(' ')) > max_len:
        return
    else:
        cl = []
        for i in gen_list:
            tmp = i.split(' ')
            tmp.reverse()
            val = ' '.join(bg.split(' ')[-2:])
            if bg.split(' ')[-1] == tmp[1] and val != ' '.join(tmp):
                cl.append(bg + ' ' + ' '.join(i.split(' ')[1:]))

    for gram in cl:
        generate_candidates(gram, 0.1, 0, clist)
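generate_candidates mutates a module-level candidate dict and calls a representativeness() helper, neither of which is shown; a minimal sketch with stand-ins for both (the bigram list is made up):

# Assumed module-level state and a stand-in scorer (not in the original).
candidate = {}

def representativeness(phrase):
    return 1.0  # hypothetical; the real scorer is defined elsewhere

bigrams = ['machine learning', 'learning rate', 'rate decay']
generate_candidates('machine learning', rd_th=0.1, rp_th=0, clist=bigrams)
print(sorted(candidate))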
Example #9
def main():
    post_path: str = sys.argv[1]

    print('-' * 80)
    print('post: {}'.format(post_path))
    print('-' * 80)

    with open(post_path) as f_in:
        data = json.load(f_in)
    content: str = data["content"]
    soup = BeautifulSoup(content, features="lxml")

    # Last line is at the bottom of the article before footnotes, so delete
    # all footnotes
    last_line: bs4.element.Tag = soup.find_all("hr")[-1]
    for elem in last_line.find_all_next():
        elem.decompose()

    paragraphs = soup.find_all('p')

    # Delete footnote links
    for paragraph in paragraphs:
        for footnote_link in paragraph.find_all('sup'):
            footnote_link.decompose()

    paragraph_strings: List[str] = [
        p.get_text().replace('\n', ' ') for p in paragraphs
    ]

    with open("/tmp/test/2fa.txt", "w") as f_out:
        for paragraph_string in paragraph_strings:
            f_out.write("{}\n\n".format(paragraph_string))

    nlp = English()
    # spaCy v2 pipeline API; in v3 this would be nlp.add_pipe("sentencizer")
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    sentences: List[str] = [
        sentence for paragraph_string in paragraph_strings
        for sentence in get_sentences(paragraph_string, nlp)
    ]

    readability_results = readability.getmeasures(sentences, lang='en')
    pprint.pprint(readability_results['readability grades'])

    for (key, value) in readability_results['readability grades'].items():
        if key in thresholds_lookup:
            thresholds = thresholds_lookup[key]
            for i, pair in enumerate(zip(thresholds, thresholds[1:])):
                # thresholds may run ascending or descending, so accept the
                # band whichever way around the pair is ordered
                if (value >= pair[0]
                        and value <= pair[1]) or (pair[0] >= value
                                                  and pair[1] <= value):
                    if i == 0:
                        print(colored("{}: {:.2f}".format(key, value),
                                      "green"))
                    elif i == 1:
                        print(
                            colored("{}: {:.2f}".format(key, value), "yellow"))
                    else:
                        print(colored("{}: {:.2f}".format(key, value), "red"))
Example #10
def readability_stats(text):
    stats = readability.getmeasures(text, lang='en')
    info = stats['sentence info']
    return (info['words'], info['syll_per_word'], info['syllables'],
            info['long_words'], info['complex_words'])
Example #11
 def get_score(text):
     print("------------READABILITY -------------------------------------")
     results = rd.getmeasures(text, lang='en')
     mediane, taux_accord = calcul_readability(results)
     print(
         mediane, "(", taux_accord,
         ")\n-------------------------------------------------------------")
     return mediane, taux_accord
Example #12
    def text_readability(self, text):
        """
        Calculate the readability for a given text.

        :return: readability measure
        :rtype: float
        """
        results = readability.getmeasures(text, lang='en')
        return results['readability grades']['FleschReadingEase']
Example #13
def post_text():
    """receives POST req from node server, runs readability
    method, responds with jsonified dict"""
    text = request.json['content']
    results = readability.getmeasures(text, lang='en')
    # print('RATING \n', results['readability grades'])
    # print(text['content'], 'TEXT')

    return jsonify(results)
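This view assumes a surrounding Flask app; a hedged sketch of that scaffolding (the route path is a guess):

from flask import Flask, request, jsonify
import readability

app = Flask(__name__)
# register the handler defined above
app.add_url_rule('/readability', view_func=post_text, methods=['POST'])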
Example #14
    def all_readability_measures(texts):
        list_of_all_measures = [
            readability.getmeasures(t, lang="en")['readability grades']
            for t in texts
        ]

        df = pd.DataFrame.from_records(list_of_all_measures)

        return df
Example #15
 def compute_readability(text, length_normalize=False):
     # `text` arrives as an escaped string (Python 2 heritage), hence decode()
     R = readability.getmeasures(text.decode('unicode-escape'),
                                 lang=u'en',
                                 merge=True)
     score = R["Kincaid"]  # Flesch-Kincaid grade level
     if length_normalize:
         score = score / float(len(text.split()))
     return score
Example #16
    def get_read_measure(self):
        value_list = []
        for cat, data in readability.getmeasures(self._sentence_data, lang='en').items():
            print('%s:' % cat)
            for key, val in data.items():
                print(('    %-20s %12.2f' % (key + ':', val)).rstrip('0 ').rstrip('.'))
                # collect every measure rather than only the last one per category
                value_list.append(val)
        return value_list
Example #17
def extract_readability_features(text):
    text = re.sub(r'\.', '.\n', text)
    text = re.sub(r'\?', '?\n', text)
    text = re.sub(r'!', '!\n', text)
    features = dict(readability.getmeasures(text, lang='en'))
    result = {}
    for d in features:
        result.update(features[d])
    result = pd.Series(result)
    return result
Example #18
def readability_stats(dataframe, row, i, current_column, new_column,
                      readability_group, readability_measure):
    this_comment = row[current_column]
    tokenized = '\n\n'.join('\n'.join(' '.join(token.value
                                               for token in sentence)
                                      for sentence in paragraph)
                            for paragraph in segmenter.analyze(this_comment))
    this_result = readability.getmeasures(tokenized, lang='en')
    c.df[dataframe].at[
        i, new_column] = this_result[readability_group][readability_measure]
Example #19
def get_readability(text):
    """
    Return the length of the text in words and its Flesch Reading Ease score.
    """
    blob = TextBlob(text)
    results = readability.getmeasures(text, lang='en')
    return [
        len(blob.words),
        round(results['readability grades']['FleschReadingEase'], 2)
    ]
Example #20
def extract_readability_features(text):
    text = re.sub(r"\.", ".\n", text)
    text = re.sub(r"\?", "?\n", text)
    text = re.sub(r"!", "!\n", text)
    features = dict(readability.getmeasures(text, lang="en"))
    result = {}
    for d in features:
        result.update(features[d])
    del result["paragraphs"]
    result = pd.Series(result)
    return result
Example #21
def infer_readability(text):
    """

    """
    if text is None or len(text) == 0:
        return None
    try:
        measures = readability.getmeasures(text)
    except Exception:
        return None
    return measures
Example #22
 def _get_score(self, preprocessed_summary, dataset, score):
     flesch_score = []
     for i in range(len(preprocessed_summary)):
         temp_sentence_list = tokenize.sent_tokenize(dataset[i, 0])
         temp_score_list = []
         for j in range(len(temp_sentence_list)):
             if(any(c.isalpha() for c in temp_sentence_list[j])):
                 results = readability.getmeasures(temp_sentence_list[j], lang='en')
                 temp_score_list.append(results['readability grades'][score])
         # assumes at least one sentence in the summary contains a letter
         flesch_score.append(min(temp_score_list))
     return np.array(flesch_score).reshape(-1,1)
Example #23
def difficulty(df):
    """
    Calculates the difficulty of the song's lyrics. Intended to be used with an apply function.
    :param df: Dataframe of the potential one-hit wonders
    :return: The readability score
    """
    if df['foreign_language'] == 1:
        return np.nan
    else:
        text = df['lyric_difficulty']
        results = readability.getmeasures(text, lang='en')
        return results['readability grades']['FleschReadingEase']
Example #24
def informality_features(text, text_id=0, complexity=False):
    # for readability, sentences should be separated by '\n'
    results = readability.getmeasures(text, lang='en')
    informality_scores = list(results['readability grades'].values())

    # reads the semantic complexity scores from output_file.csv;
    # line order matters, as the id should match the line
    if complexity:
        complexity_df = pd.read_csv(cwd + "/res/complexity/output_file.csv")
        complexity_score = complexity_df.iloc[text_id].values.tolist()
        informality_scores.extend(complexity_score)

    return informality_scores
Example #25
def master_text(text):
    #get extractive
    ex = generate_summary(text)

    #print("Finished Extractive Summary")

    #get abstractive
    fn = os.path.join(os.path.dirname(__file__), './story/a.story')

    with open(fn, 'w') as f:
        f.write(text)

    ab = abstractive_summary()

    #print("Finished Abstractive Summary")

    #get keywords
    kw, definitions = get_keywords(text)

    #print("Finished getting keywords")

    #other data
    tags = get_tags(text)

    #print("Finished getting tags")

    n = graph_sentiment(text)

    #   print("Finished graphing sentiment")

    results = readability.getmeasures(text, lang='en')
    # the bands below follow the Flesch Reading Ease scale, so use that score
    # rather than the Kincaid grade level
    t = results['readability grades']['FleschReadingEase']
    r = str(t)

    if (t > 90):
        r += " - Very easy to read. Easily understood by an average 11-year-old student."
    elif (t > 80):
        r += " - Easy to read. Conversational English for consumers."
    elif (t > 70):
        r += " - Fairly easy to read"
    elif (t > 60):
        r += " - Plain English. Easily understood by 13- to 15-year-old students."
    elif (t > 50):
        r += " - Fairly difficult to read."
    elif (t > 30):
        r += " - Difficult to read."
    else:
        r += " - Very difficult to read. Best understood by university graduates."

    print([ex, ab, kw, definitions, tags, r, n])

    return [ex, ab, kw, definitions, tags, r, n]
Example #26
def add_features_to_df(X):
    rows_to_add = []
    for _, row in X.iterrows():
        text = row[TEXT_COLUMN_NAME]
        readability_features = getmeasures(text)

        row_to_add = {}
        row_to_add.update(readability_features['readability grades'])

        rows_to_add.append(row_to_add)
    df_to_add = pd.DataFrame(rows_to_add)
    X_merged = pd.concat([X, df_to_add], axis=1)
    return X_merged
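A hedged usage sketch for add_features_to_df; TEXT_COLUMN_NAME is a module-level constant in the original, so its value here is an assumption:

TEXT_COLUMN_NAME = 'text'  # assumed column name

X = pd.DataFrame({'text': ['One sentence here .\nAnd another one .']})
X_with_features = add_features_to_df(X)
print(X_with_features.columns.tolist())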
Example #27
def main():
    features_data_file = 'data/allreadability.pickle'
    features_object = {1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: []}
    unwanted_features = [
        'paragraphs',
        'words',
        'characters',
        'sentences_per_paragraph',
        'words_per_sentence',
    ]
    final_array = None
    data_file_path = 'data/training_set_rel3.tsv'
    data = open(data_file_path, encoding="ISO-8859-1")
    lines = data.readlines()
    data.close()
    for index, line in enumerate(lines[1:]):
        if index % 50 == 0:
            print(f"processed {index} essays")
        tokens = line.strip().split('\t')
        essay_id = int(tokens[0])
        essay_set = int(tokens[1])
        content = tokens[2].strip()
        score = tokens[6]
        sent_tokens = text_tokenizer(content,
                                     replace_url_flag=True,
                                     tokenize_sent_flag=True)
        sentences = [' '.join(sent) + '\n' for sent in sent_tokens]
        sentences = ''.join(sentences)
        readability_scores = readability.getmeasures(sentences, lang='en')
        features = [essay_id]
        for cat in readability_scores.keys():
            for subcat in readability_scores[cat].keys():
                if subcat not in unwanted_features:
                    ind_score = readability_scores[cat][subcat]
                    features.append(ind_score)
        features_object[essay_set].append(features)
    for key in features_object.keys():
        features_object[key] = np.array(features_object[key])
        min_v, max_v = features_object[key].min(
            axis=0), features_object[key].max(axis=0)
        features = (features_object[key] - min_v) / (max_v - min_v)
        features = np.nan_to_num(features)
        # write the normalized values back, keeping the essay_id column intact
        features_object[key][:, 1:] = features[:, 1:]
        if final_array is None:
            final_array = features_object[key]
        else:
            final_array = np.vstack((final_array, features_object[key]))

    with open(features_data_file, 'wb') as fp:
        pickle.dump(final_array, fp)
Example #28
def readability_study(txt_file_name):
    with open(txt_file_name, "r") as f:
        text = f.read()

    measures = readability.getmeasures(text, lang="en")

    for cat, data in measures.items():
        print('%s:' % cat)
        for key, val in data.items():
            print(('    %-25s %12.2f' %
                   (key + ':', val)).rstrip('0 ').rstrip('.'))

    return measures
Example #29
def run_readability(texts):
    out = []
    for text in texts:
        tokenized = '\n\n'.join(
            '\n'.join(
            ' '.join(token.value for token in sentence) 
            for sentence in paragraph) 
            for paragraph in segmenter.analyze(text))
        results = readability.getmeasures(tokenized, lang='en')
        data = {}
        for key in results:
            data[key.replace(' ', '')] = dict(results[key])
        out.append(data)
    return out
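Examples #18 and #29 call segmenter.analyze() on raw text and read token.value; this matches the syntok package's API, which is an assumption since the import is not shown:

# Assumed import for the `segmenter` used above: syntok's analyze() yields
# paragraphs -> sentences -> tokens, where each token exposes a .value string.
from syntok import segmenter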
Example #30
 def analyze_text(self, text):
     tokenized = self.tokenize_text(text)
     measures = readability.getmeasures(tokenized, lang='en')
     return {
         'grade_level':
         measures['readability grades']['Kincaid'],
         'words':
         measures['sentence info']['words'],
         'words_per_sentence':
         measures['sentence info']['words_per_sentence'],
         'sentences_per_paragraph':
         measures['sentence info']['sentences_per_paragraph'],
         'paragraphs':
         measures['sentence info']['paragraphs']
     }
Example #31
 def readability_features(self):
     features = []
     content = '.'.join([self.title, self.short, self.need, self.essay])
     # count escaped paragraph breaks stored literally as "\n\n" in the text
     n_para = len(re.findall(r'\\n\\n', content))
     # pad with empty lines so getmeasures sees the right paragraph count
     measures = getmeasures(sent_detector.tokenize(content) + [''] * n_para)
     grades = measures['readability grades']
     features += grades.values()
     sent_info = measures['sentence info']
     features += sent_info.values()
     word_usage = measures['word usage']
     features += word_usage.values()
     sent_begin = measures['sentence beginnings']
     features += sent_begin.values()
     return tuple(features)
Example #32
def score_statements(filename=DEFAULT_FILENAME, loglevel=logging.INFO, database=DB_PATH):
    sia = SentimentIntensityAnalyzer()
    i = -1  # keep the return value defined even if there are no statements
    for i, statement in enumerate(Statement.objects.iterator()):
        s = sia.polarity_scores(statement.text)
        score = Score(positive=s['pos'], negative=s['neg'], neutral=s['neu'], compound=s['compound'],
                      intensity=abs(s['compound']))
        words = statement.text.split()
        if len(words) and any(words):
            superficial_measures = getmeasures(words)
            score.flesch = superficial_measures['readability grades']['FleschReadingEase']
            score.kincaid = superficial_measures['readability grades']['Kincaid']
            score.dale_chall = textstat.dale_chall_readability_score(statement.text)
        else:
            score.flesch = 0
            score.kincaid = 0
            score.dale_chall = 0
        score.save()
        statement.score = score
        statement.save()
        print(statement.score)
    return i
Example #33
def getreadabilitymeasures(numsents):
	"""Get readability of all files and store results in a dictionary."""
	try:
		import readability
	except ImportError:
		APP.logger.warning(
			'readability module not found; install with:\npip install'
			' https://github.com/andreasvc/readability/tarball/master')
		return {}
	files = glob.glob(os.path.join(CORPUS_DIR, '*.tok'))
	results = {}
	# consider a fixed number of sentences to get comparable results
	cutoff = min(numsents)
	for filename in sorted(files):
		name = os.path.basename(filename)
		# flatten results into a single dictionary of (key, value) pairs.
		results[name] = {key: value
				for data in readability.getmeasures(
						islice(io.open(filename, encoding='utf8'), cutoff),
						lang=LANG).values()
					for key, value in data.items()}
	return results