def count_fry_readability(data_list):
    sentence_numbers = 0
    syllables_numbers = 0
    # first sample: accumulate sentences from the start until at least 150 words
    words_count = 0
    for sentence in data_list:
        sentence_numbers = sentence_numbers + 1
        syllables_numbers = syllables_numbers + textstat.syllable_count(sentence)
        words_count = words_count + textstat.lexicon_count(sentence)
        if words_count >= 150:
            break

    # second sample: accumulate sentences from the end until at least 150 words
    words_count = 0
    for sentence in reversed(data_list):
        sentence_numbers = sentence_numbers + 1
        syllables_numbers = syllables_numbers + textstat.syllable_count(sentence)
        words_count = words_count + textstat.lexicon_count(sentence)
        if words_count >= 150:
            break

    # the two ~150-word samples stand in for the three 100-word samples of the Fry graph
    avg_sentence_numbers = round(sentence_numbers / 3)
    avg_syllables_numbers = round(syllables_numbers / 3)

    return get_value_from_fry_graph(avg_sentence_numbers, avg_syllables_numbers)
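A minimal usage sketch for count_fry_readability, assuming the sentence list comes from NLTK's tokenizer and that get_value_from_fry_graph is defined elsewhere in the same project (the file name is hypothetical):

import nltk
import textstat

raw_text = open("article.txt").read()      # hypothetical input file
sentences = nltk.sent_tokenize(raw_text)   # list of sentences expected by the function
fry_grade = count_fry_readability(sentences)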
Example #2
def test_lexicon_count():
    textstat.set_lang("en_US")
    count = textstat.lexicon_count(long_test)
    count_punc = textstat.lexicon_count(long_test, removepunct=False)

    assert count == 372
    assert count_punc == 376
def lexicon_table():
    lst_1 = [
        textstat.lexicon_count(text) for text in insincere_questions.tolist()
    ]
    lst_2 = [
        textstat.lexicon_count(text) for text in sincere_questions.tolist()
    ]
    table = build_table(lst_1, lst_2)
    py.plot(table, filename='lexicon_table')
def build_lexicon_dict(lst):
    dct = {}
    for t in lst:
        # count the words of each text once and tally how often each count occurs
        count = textstat.lexicon_count(t)
        dct[count] = dct.get(count, 0) + 1
    for key in dct.keys():
        dct[key] = dct[key] / len(lst)
    sorted_tuple = sorted(dct.items(), key=operator.itemgetter(0))
    return sorted_tuple
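A small worked example of build_lexicon_dict on hypothetical data; it returns (word-count, relative-frequency) pairs sorted by word count:

import operator
import textstat

texts = ["one two three", "four five", "six seven eight nine"]
print(build_lexicon_dict(texts))
# [(2, 0.333...), (3, 0.333...), (4, 0.333...)] - each count occurs in one of three texts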
Example #5
    def transform(self, input_df: pd.DataFrame) -> coo_matrix:
        """
        It computes and returns the linguistic features from the input DF. The DF must include the following attributes
        in its columns: Q_TEXT, Q_ID
        :param input_df:
        :return:
        """
        if Q_TEXT not in input_df.columns:
            raise ValueError("Q_TEXT should be in input_df.columns")
        if Q_ID not in input_df.columns:
            raise ValueError("Q_ID should be in input_df.columns")

        correct_ans_text_dict = gen_correct_answers_dict(input_df)
        wrong_ans_text_dict = gen_wrong_answers_dict(input_df)

        df = pd.DataFrame()
        df['lexicon_count_question'] = input_df.apply(
            lambda r: textstat.lexicon_count(r[Q_TEXT]), axis=1)
        df['lexicon_count_correct_choices'] = input_df.apply(
            lambda r: np.mean([
                textstat.lexicon_count(x)
                for x in correct_ans_text_dict[r[Q_ID]]
            ]),
            axis=1)
        df['lexicon_count_wrong_choices'] = input_df.apply(
            lambda r: np.mean([
                textstat.lexicon_count(x)
                for x in wrong_ans_text_dict[r[Q_ID]]
            ]),
            axis=1)
        df['sentence_count_question'] = input_df.apply(
            lambda r: textstat.sentence_count(r[Q_TEXT]), axis=1)
        df['sentence_count_correct_choices'] = input_df.apply(
            lambda r: np.mean([
                textstat.sentence_count(x)
                for x in correct_ans_text_dict[r[Q_ID]]
            ]),
            axis=1)
        df['sentence_count_wrong_choices'] = input_df.apply(
            lambda r: np.mean([
                textstat.sentence_count(x)
                for x in wrong_ans_text_dict[r[Q_ID]]
            ]),
            axis=1)
        df['avg_word_len_question'] = input_df.apply(
            lambda r: np.mean([len(x) for x in r[Q_TEXT].split(' ')]), axis=1)
        df['ratio_len_question_correct_choices'] = df.apply(
            lambda r: (1 + r['lexicon_count_question']) /
            (1 + r['lexicon_count_correct_choices']),
            axis=1)
        df['ratio_len_question_wrong_choices'] = df.apply(
            lambda r: (1 + r['lexicon_count_question']) /
            (1 + r['lexicon_count_wrong_choices']),
            axis=1)
        return coo_matrix(df.values)
def feature_getter(text):
    try:
        text = text.decode('utf-8')
    except (AttributeError, UnicodeDecodeError):
        pass
    # strip non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    # text = re.sub('\n', '. ', text)
    features = []
    tokens = []
    sentences = nltk.sent_tokenize(text)
    for sentence in sentences:
        tokens.extend(nltk.word_tokenize(sentence))

    syllable_count = textstat.syllable_count(text, lang='en_US')
    word_count = textstat.lexicon_count(text, removepunct=True)

    flesch = textstat.flesch_reading_ease(text)
    readability = textstat.automated_readability_index(text)

    features.append(len(sentences))   # num_sentences
    features.append(syllable_count)   # num_syllables
    features.append(word_count)       # num_words
    features.append(flesch)           # flesch_reading_ease
    features.append(readability)      # automated_readability_index
    return features
def textstat_stats(text):
    doc_length = len(text.split()) 
    flesch_ease = ts.flesch_reading_ease(text) #Flesch Reading Ease Score
    flesch_grade = ts.flesch_kincaid_grade(text) #Flesch-Kincaid Grade Level
    gfog = ts.gunning_fog(text) # FOG index, also indicates grade level
#    smog = ts.smog_index(text) # SMOG index, also indicates grade level, only useful on 30+ sentences
    auto_readability = ts.automated_readability_index(text) #approximates the grade level needed to comprehend the text.
    cl_index = ts.coleman_liau_index(text) #grade level of the text using the Coleman-Liau Formula.
    lw_formula = ts.linsear_write_formula(text) #grade level using the Linsear Write Formula.
    dcr_score = ts.dale_chall_readability_score(text) #uses a lookup table of the most commonly used 3000 English words
#    text_standard = ts.text_standard(text, float_output=False) # summary of all the grade level functions
    syll_count = ts.syllable_count(text, lang='en_US')
    syll_count_scaled = syll_count / doc_length
    lex_count = ts.lexicon_count(text, removepunct=True)
    lex_count_scaled = lex_count / doc_length
    idx = ['flesch_ease', 'flesch_grade','gfog',
           'auto_readability','cl_index','lw_formula',
           'dcr_score', 
#           'text_standard', 
           'syll_count', 'lex_count']
    return pd.Series([flesch_ease, flesch_grade, gfog, 
                      auto_readability, cl_index, lw_formula, 
                      dcr_score, 
#                      text_standard, 
                      syll_count_scaled, lex_count_scaled], index = idx)
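A hypothetical usage of textstat_stats, assuming pandas is available, textstat is imported as ts (as the snippet does), and a textstat version that still accepts the lang argument used above; each document becomes one row of readability features:

import pandas as pd
import textstat as ts

docs = pd.Series([
    "Readability formulas estimate how hard a passage is to read.",
    "Short text. Easy words.",
])
feature_df = docs.apply(textstat_stats)
print(feature_df[["flesch_ease", "gfog", "lex_count"]])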
Example #8
def calculate_stats(data_folder):
    """Calculate stat of test.json file in a folder"""
    data_folder = Path(data_folder)
    for dataset in dataset_fields:
        print(f"loading {dataset}")
        field = dataset_fields[dataset]["text"].strip()
        sentences = []
        with open(data_folder / dataset / "test.json") as f:
            items = json.load(f)
        for item in items:
            sentences.append(item[field][-1] if isinstance(item[field], list) else item[field])

        text = " ".join(sentences)
        lex_count = textstat.lexicon_count(text)
        print(lex_count)
        unique_words = count_words(text)
        print(f"all unique {len(unique_words)}")

        lower_unique_words = count_words(text, casing="lower")
        print(f"lowercase unique {len(lower_unique_words)}")

        upper_unique_words = count_words(text, casing="upper")
        print(f"uppercase unique {len(upper_unique_words)}")

        print(f"ratio {len(upper_unique_words) / len(unique_words)}")

        text_standard = textstat.text_standard(text, float_output=True)
        print(f"text_standard: {text_standard}")

        dale_chall_readability_score = textstat.dale_chall_readability_score(text)
        print(f"dale_chall_readability_score: {dale_chall_readability_score}")

        flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
        print(f"flesch_kincaid_grade: {flesch_kincaid_grade}")
Example #9
def textStatistics(text):
    """
    returns text statistics such as lexicon count and text standard in a tuple
    """
    le_c = textstat.lexicon_count(text, removepunct=True)
    ts = textstat.text_standard(text, float_output=True)

    return le_c, ts
Example #10
    def statistics(self, text):
        self.asl = textstat.avg_sentence_length(text)
        self.avg_sentence_per_word = textstat.avg_sentence_per_word(text)
        self.avg_syllables_per_word = textstat.avg_syllables_per_word(text)
        self.difficult_words = textstat.difficult_words(text)
        self.lexicon_count = textstat.lexicon_count(text)
        self.polysyllable_count = textstat.polysyllabcount(text)
        self.sentence_count = textstat.sentence_count(text)
def flesch_kincaid(row):
    text = row['reviewText']
    words = max(1, textstat.lexicon_count(text))
    sentences = max(1, sentence_count(row))
    syllables = textstat.syllable_count(text, lang='en_US')
    score = 206.835 - 1.015 * (float(words) /
                               sentences) - 84.6 * (float(syllables) / words)
    return score
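Despite its name, flesch_kincaid above computes the Flesch Reading Ease score, 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words). A minimal self-contained sanity check against textstat's own implementation (the values should be close, though syllable counting and rounding may differ slightly):

import textstat

text = "This is a short review. It reads very easily."
words = max(1, textstat.lexicon_count(text))
sentences = max(1, textstat.sentence_count(text))
syllables = textstat.syllable_count(text)
manual = 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
print(manual, textstat.flesch_reading_ease(text))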
Example #12
def get_raw_stats(book, text):
    return {
        'total_words': textstat.lexicon_count(text),
        'total_sentences': len(sent_tokenize(text)),
        'total_letters': textstat.letter_count(text),
        'total_syllables': textstat.syllable_count(text),
        # 'total_paragraphs': len(get_paragraphs(text)),
        # 'average_word_difficulty': get_average_frequency(book)
    }
    def __init__(self, text):
        self.__doc = self.preprocess(text)
        self.__docWords = self.getTotalWords()
        self.__totalWords = textstat.lexicon_count(self.__doc, removepunct=True)
        self.__totalCharacters = textstat.char_count(self.__doc, ignore_spaces=True)
        self.__totalSentences = self.getSentencesCount()
        self.__totalSyllables = self.getSyllablesCount()
        # self.__totalSyllables = textstat.syllable_count(self.__doc)
        self.__polySyllableCount = self.getPolySyllableCount()
def count_raygor_readability(data_list):
    sentence_numbers = 0
    words_count_bigger_six = 0
    # first sample: accumulate sentences from the start until at least 50 words
    words_count = 0
    for sentence in data_list:
        sentence_numbers = sentence_numbers + 1
        words_count = words_count + textstat.lexicon_count(sentence)
        words_count_bigger_six = words_count_bigger_six + len([1 for n in sentence.split() if len(n) > 6])
        if words_count >= 50:
            break

    # second sample: accumulate sentences from the end until at least 50 words
    words_count = 0
    for sentence in reversed(data_list):
        sentence_numbers = sentence_numbers + 1
        words_count = words_count + textstat.lexicon_count(sentence)
        words_count_bigger_six = words_count_bigger_six + len([1 for n in sentence.split() if len(n) > 6])
        if words_count >= 50:
            break

    return get_value_from_raygor_graph(sentence_numbers, words_count_bigger_six)
Example #15
def feature_eng(df, text_col):
    org_cols = df.columns
    # character length of the original text
    df['Sent_length'] = df[text_col].apply(len)

    # two or more consecutive exclamation marks (!!) - binary feature
    reg = re.compile("(!)\\1{1,}")
    has_more_than2_exm = lambda x: np.where(len(reg.findall(x)) == 0, 0, 1)

    df['more_that_2_exm'] = df[text_col].apply(has_more_than2_exm)
    df['words_count'] = df[text_col].apply(lambda x: textstat.lexicon_count(x))
    num_cols = [col for col in df.columns if col not in org_cols]
    return num_cols
def sentence_fit(gen_text, orig_text):
    df = pd.DataFrame(
        gen_text,
        columns=['generated'])  # Text generated from GPT2 stored in dataframe
    df['generated'] = df['generated'].str.replace(r' +,', ',').str.replace(
        r' +\.', '.')  # Remove spaces in front of punctuation
    df['similarity'] = df['generated'].apply(lambda x: text_similarity(
        orig_text, x))  # Assess cosine similarity between sentences
    df['n_syll'] = df['generated'].apply(
        textstat.syllable_count)  # Count number of syllables
    df['n_lex'] = df['generated'].apply(
        textstat.lexicon_count)  # Count number of words
    df['syll_lex'] = df['n_syll'] / df['n_lex']  # Syllable to word ratio

    # Flags indicating whether the generated text has fewer words, fewer
    # syllables, or a lower syllable-to-word ratio than the original
    df['rel_syll'] = np.where(
        df['n_syll'] < textstat.syllable_count(orig_text), 1, 0)
    df['rel_lex'] = np.where(df['n_lex'] < textstat.lexicon_count(orig_text),
                             1, 0)
    df['rel_rat'] = np.where(
        df['syll_lex'] <
        textstat.syllable_count(orig_text) / textstat.lexicon_count(orig_text),
        1, 0)

    # Sum binary indicators of relative sentence simplicity
    df['rel_simp'] = (df['rel_syll'] + df['rel_lex'] + df['rel_rat']) / 3

    # Fit score is weighted sum of similarity and relative sentence simplicity
    # Highest score will be chosen
    df['fit_score'] = 0.7 * df['similarity'] + 0.3 * df['rel_simp']

    # Subset data and rename columns
    df['Original'] = orig_text
    df = df[['Original', 'generated', 'similarity', 'rel_simp', 'fit_score']]
    df.columns = [
        'Original', 'Generated', 'Similarity', 'Simplicity', 'Fit Score'
    ]

    return df
def word_count(class_name):
    total_word = 0
    for idx, items in enumerate(df['genre']):
        if items == class_name:
            book_id = df.at[idx, 'book_id']
            content = df.at[idx, 'content']
            no_of_words = textstat.lexicon_count(str(content),
                                                 removepunct=True)
            total_word = total_word + no_of_words
            det_and_mys_dict[book_id] = no_of_words

    min_word_id = min(det_and_mys_dict.items(), key=lambda x: x[1])
    max_word_id = max(det_and_mys_dict.items(), key=lambda x: x[1])
    return (min_word_id, max_word_id, total_word)
Example #18
def analyze_vocab(text):
    return {
        'num_words': textstat.lexicon_count(text),
        'flesch_reading_ease': textstat.flesch_reading_ease(text),
        'smog_index': textstat.smog_index(text),
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
        'coleman_liau_index': textstat.coleman_liau_index(text),
        'automated_readability_index':
        textstat.automated_readability_index(text),
        'dale_chall_readability_score':
        textstat.dale_chall_readability_score(text),
        'difficult_words': textstat.difficult_words(text),
        'linsear_write_formula': textstat.linsear_write_formula(text),
        'gunning_fog': textstat.gunning_fog(text),
        'text_standard': textstat.text_standard(text, float_output=True)
    }
def lisibilty(text):
    f_lis = [
        textstat.syllable_count(str(text), lang='en_arabic'),
        textstat.lexicon_count(str(text), removepunct=True),
        textstat.sentence_count(str(text)),
        textstat.flesch_reading_ease(str(text)),
        textstat.flesch_kincaid_grade(str(text)),
        textstat.gunning_fog(str(text)),
        textstat.smog_index(str(text)),
        textstat.automated_readability_index(str(text)),
        textstat.coleman_liau_index(str(text)),
        textstat.linsear_write_formula(str(text)),
        textstat.dale_chall_readability_score(str(text))
    ]
    return f_lis
def dale_chall(row):
    text = row['reviewText']
    easywords = open("easy_words.txt").read().splitlines()
    words = tokenize.word_tokenize(text)
    words = list(map(lambda x: x.lower(), words))
    sentences = max(1, sentence_count(row))
    wordcount = max(1, textstat.lexicon_count(text))
    easywordcount = 0
    for easyword in easywords:
        easywordcount += words.count(easyword)
    diffwordsratio = (float(wordcount - easywordcount) / wordcount)
    score = 0.1579 * (diffwordsratio * 100) + 0.0496 * (float(wordcount) /
                                                        sentences)
    if diffwordsratio > 0.05:
        score += 3.6365  # standard Dale-Chall adjustment when difficult words exceed 5%
    return score
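A small worked example of the Dale-Chall formula used above, with hypothetical counts (100 words, 5 sentences, 8 difficult words). Note that textstat.dale_chall_readability_score applies the same formula with its own word list, so scores on real text will differ from this easy_words.txt approximation:

pct_difficult = 8.0          # 8 of 100 words are not on the easy-word list
avg_sentence_len = 100 / 5   # 20 words per sentence
score = 0.1579 * pct_difficult + 0.0496 * avg_sentence_len
if pct_difficult > 5:
    score += 3.6365          # adjustment when difficult words exceed 5%
print(round(score, 2))       # 5.89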
def add_features(row):
    '''Feature engineering via NLP.'''
    text = row.text
    doc = nlp(text)
    lemmas = list()
    entities = list()
    for token in doc:
        if token.text == ':':
            row['has_colon'] = 1
        if token.text == ';':
            row['has_semicolon'] = 1
        if token.text == '-':
            row['has_dash'] = 1
        if token.text.lower() == 'whom':
            row['whom'] = 1
        if token.text[-3:] == 'ing':
            row['num_ings'] += 1
        if token.text.lower() == 'had':
            row['has_had'] = 1
        pos = token.pos_
        row[pos] += 1
        if token.is_stop or not token.is_alpha:
            continue
        lemma = token.lemma_.strip().lower()
        if lemma:
            lemmas.append(lemma)
    for ent in doc.ents:
        entities.append(ent.text)
    lemmas = ' '.join(lemmas)
    blob = TextBlob(text)
    row['subjectivity'] = blob.sentiment.subjectivity
    row['polarity'] = blob.sentiment.polarity
    row['starts_conj'] = int(doc[0].pos_ == 'CONJ')
    row['ends_prep'] = int(doc[-1].pos_ == 'ADP')  # last token; spaCy tags prepositions as ADP
    row['entities'] = entities
    row['lemmas'] = lemmas
    row['raw_text_length'] = len(text)
    row['num_words'] = len(doc)
    row['avg_word_len'] = row.raw_text_length / row.num_words
    row['vector_avg'] = np.mean(nlp(lemmas).vector)
    row['num_ings'] /= row['num_words']
    row['rhyme_frequency'] = rhyme_frequency(row['text'])
    row['dale_chall'] = textstat.dale_chall_readability_score(row['text'])
    row['FleischReadingEase'] = textstat.flesch_reading_ease(row['text'])
    row['lexicon'] = textstat.lexicon_count(row['text'])
    row['word_diversity'] = row.lexicon / row.num_words
    return row
def get_desc_data(string):
    '''
    Input: book description string
    Output: desc_semantic, word_cnt, description_len, number_unique_words,
            average_word_len, syl_count, lex_count, sent_count, flesch
    '''
    #Data before text processing
    desc_semantic = get_semantic(string)
    syl_count = syllable_count(string)
    lex_count = lexicon_count(string)
    sent_count = sentence_count(string)
    flesch = flesch_reading_ease(string)

    #Data after text processing
    string = text_preprocess(string)
    word_cnt = word_count(string)
    description_len = desc_len(string)
    number_unique_words = num_unique_words(string)
    average_word_len = avg_word_len(string)
    return desc_semantic, word_cnt, description_len, number_unique_words, \
           average_word_len, syl_count, lex_count, sent_count, flesch
Example #23
def process_datum(datum):
    # Remove tags
    soup = BeautifulSoup(datum["Content"], features="html.parser")

    clean_soup = BeautifulSoup(datum["Content"], features="html.parser")
    for elm in clean_soup(["code"]):
        elm.extract()

    body_text = clean_soup.get_text()

    pos_tags = pos_tag(word_tokenize(body_text))

    pos_counts = Counter([tag for word, tag in pos_tags])
    # preterm_counts =

    result = {}
    result['TEXT'] = body_text
    result['CT1'] = lexicon_count(body_text)
    result['CT2'] = sentence_count(body_text)
    for tag in POS_TAGS:
        result['CT3.' + tag] = pos_counts[tag]
    # for preterm in PRETERMINALS:
    # results['CT4.' + preterm] =
    # code and img elements carry no href attribute, so count them directly
    result['CN1'] = len(soup.find_all("code")) +\
        len(soup.find_all("img")) +\
        len(soup.find_all("span", {"class": "math-container"}))
    result['CN2'] = len(soup.find_all("a", href=True))
    result['U1.SUM'] = datum['U1.SUM']
    result['U1.1'] = datum['U1.1']
    result['U1.2'] = datum['U1.2']
    result['U2'] = datum['U2']
    result['Y1'] = datum['Y1']
    result['Y2'] = datum['Y2']
    result['T'] = datum['T']
    result['S'] = datum['S']
    result['D'] = datum['D']

    return result
def lexical_counts(sent):
    return textstat.lexicon_count(sent, removepunct=True)
Example #25
    def test_lexicon_count(self):
        count = textstat.lexicon_count(self.long_test)
        count_punc = textstat.lexicon_count(self.long_test, removepunct=False)

        self.assertEqual(372, count)
        self.assertEqual(376, count_punc)
def lexicon_count(corpus):
    return np.array([textstat.lexicon_count(doc)
                     for doc in corpus]).reshape(-1, 1)
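Because lexicon_count above returns a column vector, a minimal sketch like the following (assuming scikit-learn is installed, plus the numpy/textstat imports the snippet relies on) can plug it into a feature union next to a text vectorizer:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer

features = FeatureUnion([
    ("tfidf", TfidfVectorizer()),
    ("lexicon", FunctionTransformer(lexicon_count, validate=False)),
])
X = features.fit_transform(["A short doc.", "A slightly longer document here."])
print(X.shape)  # (2, number of tfidf features + 1)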
import csv

from stop_words import safe_get_stop_words
from readability import Readability
import textstat

csv.field_size_limit(500 * 1024 * 1024)
file1 = open('gfi_description.csv')
csv_reader = csv.reader(file1)
header = next(csv_reader)

file2 = open('gfi_description_attributes.csv', 'w')
csv_writer = csv.writer(file2)
csv_writer.writerow([
    'id', 'num_url', 'num_image', 'num_code', 'num_comment', 'num_table',
    'count_word_title', 'count_word_body', 'readability'
])

for item in csv_reader:
    title = item[1]
    body = item[2]
    count_word_title = textstat.lexicon_count(title, removepunct=True)
    count_word_body = textstat.lexicon_count(body, removepunct=True)
    readability = textstat.coleman_liau_index(body)
    print(count_word_title, count_word_body, readability)
    csv_writer.writerow([
        item[0], item[3], item[4], item[5], item[6], item[7], count_word_title,
        count_word_body, readability
    ])

file1.close()
file2.close()
Example #28
def test_lexicon_count():
    count = textstat.lexicon_count(long_test)
    count_punc = textstat.lexicon_count(long_test, removepunct=False)

    assert count == 372
    assert count_punc == 376
Example #29
# In[46]:

print(average)

# In[33]:

textstat.flesch_reading_ease(text)

# In[40]:

textstat.automated_readability_index(text)

# In[35]:

textstat.lexicon_count(text, removepunct=True)

# In[22]:

textstat.syllable_count(text, lang='en_US')

# In[82]:

textstat.text_standard(text)

# In[9]:

len(set(cleaning_features(text)))

# In[7]:
Example #30
###############################################################################
# Readability scores: Greta Thunberg

import textstat
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# drop empty text fields
temp = greta.copy()
temp['text'].replace('', np.nan, inplace=True)
temp['text'].replace(' ', np.nan, inplace=True)
temp.dropna(subset=['text'], inplace=True)

temp['syl_count'] = temp.text.apply(lambda x: textstat.syllable_count(x))
temp['word_count'] = temp.text.apply(
    lambda x: textstat.lexicon_count(x, removepunct=True))
temp['sent_count'] = temp.text.apply(lambda x: textstat.sentence_count(x))
temp['score_fre'] = temp.text.apply(lambda x: textstat.flesch_reading_ease(x))
temp['score_are'] = temp.text.apply(
    lambda x: textstat.automated_readability_index(x))
temp['char_count'] = temp.text.apply(lambda x: len(x))

sns.distplot(temp.word_count,
             hist=True,
             kde=False,
             norm_hist=True,
             color='darkblue',
             hist_kws={'edgecolor': 'black'})

fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(nrows=2, ncols=2, figsize=(8, 6))
fig.subplots_adjust(hspace=.5)
Example #31
import gzip
import os
import signal
import sys

import textstat


def handler(signum, frame):
    sys.exit(0)


signal.signal(signal.SIGINT, handler)

d = "/home/adulau/dess/12/01"
ld = os.listdir(d)
stats = {}
stats['hits'] = 0
stats['miss'] = 0

for f in ld:
    currentfile = os.path.join(d, f)
    with gzip.open(currentfile) as paste:
        content = paste.read().decode('utf-8')
        lexicon = textstat.lexicon_count(content, removepunct=True)
        syllabe = textstat.syllable_count(content, lang='en_US')
        sentence = textstat.sentence_count(content)
        # consensus = textstat.text_standard(content, float_output=False)
        # print ("sentence={}, syllabe={}, lexicon={}, flesch_reading_score={},{}".format(sentence, syllabe, lexicon, textstat.flesch_reading_ease(content), currentfile))
        analysis = {}
        analysis['sentence'] = sentence
        analysis['syllabe'] = syllabe
        analysis['lexicon'] = lexicon
        analysis['flesch_reading_ease'] = textstat.flesch_reading_ease(content)
        analysis['filename'] = currentfile
        analysis['length'] = len(content)
        analysis['extract'] = content[:100]

        #rank = (analysis['flesch_reading_ease']+analysis['flesch_reading_ease']+analysis['lexicon'])*analysis['sentence']
        rank = analysis['flesch_reading_ease']