Example #1
def fkBLEU_file(source, preds, refs, preprocess=as_is, pass_indiv=False):
    files = [codecs.open(fis, "r", 'utf-8') for fis in [source, preds, refs]]
    scores = []
    references = []
    hypothese = []
    fkbleu = 0
    n = 0
    for src, pred, ref in izip(*files):
        references = [word_tokenize(preprocess(r)) for r in ref.split('\t')]
        hypothese = word_tokenize(preprocess(pred))
        source = word_tokenize(preprocess(src))
        # Smoothing method 3: NIST geometric sequence smoothing
        ibleu = 0.9 * corpus_bleu(
            [references], [hypothese],
            smoothing_function=smooth.method3) - 0.1 * corpus_bleu(
                [source], [hypothese], smoothing_function=smooth.method3)
        try:
            fkdiff = textstat.flesch_reading_ease(
                ' '.join(hypothese)) - textstat.flesch_reading_ease(
                    ' '.join(source))
            n += 1
            fkdiff = 1 / (1 + np.exp(-fkdiff))
            fkbleu += fkdiff * ibleu
        except Exception:
            continue
    if n:
        fkbleu /= n
    for fis in files:
        fis.close()
    return fkbleu
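
For context, Example #1 relies on module-level setup from its parent project. Below is a minimal sketch of the imports it appears to assume (NLTK's BLEU with smoothing, the old textstat layout, NumPy, and an identity `as_is` preprocessor); these definitions are illustrative assumptions, not the original project's code:

import codecs
import numpy as np
from itertools import izip  # Python 2; on Python 3 use the built-in zip
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from textstat.textstat import textstat

smooth = SmoothingFunction()  # provides the smooth.method3 used above
as_is = lambda s: s           # default "no preprocessing" hook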
Example #2
def complexityAlongtheText( f, chunk_length = 100 ):
	text = loadText(f)
	words = text.split()
	x = []
	y = []
	cur = 0
	average = textstat.flesch_reading_ease(text)

	while ( cur < len(words) ):
	    sub = words[cur:cur+chunk_length]
	    sub.append('.')
	    sub_text = ' '.join(sub)
	    y.append( 100 - textstat.flesch_reading_ease(sub_text)  )
	    x.append( cur)
	    cur += chunk_length
	    
	if average < 20:
	    col = colours[4]
	elif average < 40:
	    col = colours[6]
	elif average < 60:
	    col = colours[3]
	elif average < 80:
	    col = colours[1]
	else:
	    col = colours[0]
	plt.plot(x,y, color = [ 1.0 / 255.0 * c for c in col], alpha = 0.6, linewidth = 5)    
	plt.fill_between(x,y, color = [ 1.0 / 255.0 * c for c in col], alpha = 0.3)
	#     plt.plot( [0,max(x)], [average,average], color = 'gray')
	plt.ylim([0,100])
	plt.xlabel("number of words")
	plt.ylabel("difficulty")
	plt.show()
Example #3
def complexityAlongtheText(text, n_chunk=10):
    words = text.split()
    chunk_length = len(words) / n_chunk
    if chunk_length < 200:
        chunk_length = 200

    chunk_length = int(chunk_length)

    x = []
    y = []
    cur = 0
    # average = textstat.flesch_reading_ease(text)

    while (cur < len(words)):
        sub = words[cur:cur + chunk_length]
        sub.append('.')
        sub_text = ' '.join(sub)
        try:
            diff = 100 - textstat.flesch_reading_ease(sub_text)
            if diff < 100:
                y.append(diff)
                x.append(cur)
        except Exception:
            print "cannot compute complexity in 'complexityAlongtheText'"
        cur += chunk_length

    if len(y):
        average = float(sum(y)) / float(len(y))
    else:
        average = 0
    # print "average reading ease: %s "%average

    if average < 20:
        col = colours_ordered[0]
    elif average < 40:
        col = colours_ordered[1]
    elif average < 60:
        col = colours_ordered[2]
    elif average < 80:
        col = colours_ordered[3]
    else:
        col = colours_ordered[4]

    full_data = dict()
    data = []
    for i in range(0, len(y)):
        tmp = dict()
        tmp['x'] = x[i]
        tmp['y'] = y[i]
        # tmp['color'] = col
        data.append(tmp)

    full_data['values'] = data
    full_data['color'] = col
    #     plt.plot( [0,max(x)], [average,average], color = 'gray')

    return full_data
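
A brief usage sketch (the file name is hypothetical) showing how the structure returned above can be consumed:

profile = complexityAlongtheText(open('essay.txt').read(), n_chunk=10)
xs = [point['x'] for point in profile['values']]  # word offset where each chunk starts
ys = [point['y'] for point in profile['values']]  # 100 - Flesch Reading Ease for that chunk
chart_colour = profile['color']                   # entry from colours_ordered chosen by the average difficulty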
Example #4
def sentence_stats(s1, s2):
    #s2 should be predictions and s1 should be source
    try:
        fkdiff = textstat.flesch_reading_ease(
            s2) - textstat.flesch_reading_ease(s1)
    except Exception:
        fkdiff = 0.0
    doc1 = nlp(s1)
    doc2 = nlp(s2)
    ts = tree_sim(doc1, doc2) / 100
    ds = doc_sim(doc1, doc2)
    return (torch.FloatTensor([fkdiff, ts, ds]))
Example #5
def readability(text, file):
    fog = textstat.gunning_fog(text)
    fres = textstat.flesch_reading_ease(text)
    fkgl = textstat.flesch_kincaid_grade(text)
    file.write(
        '\nGunning Fog Index: %d \nFlesch Reading Ease: %d \nFlesch-Kincaid Grade: %d'
        % (fog, fres, fkgl))
Example #6
    def _calculate_scores(self, docs):
        docs_scores = []

        for doc in docs:
            scores = {}
            scores['chars'] = ts.char_count(doc)
            scores['words'] = ts.lexicon_count(doc)
            scores['sents'] = ts.sentence_count(doc)
            #scores['syllables'] = ts.syllable_count(doc)
            scores['avg_sent_length'] = ts.avg_sentence_length(doc)
            scores['avg_syllables_per_word'] = ts.avg_syllables_per_word(doc)
            scores['avg_letters_per_word'] = ts.avg_letter_per_word(doc)
            scores['flesch'] = ts.flesch_reading_ease(doc)
            #scores['smog'] = ts.smog_index(doc)
            #scores['coleman_liau'] = ts.coleman_liau_index(doc)
            scores['automated_readability'] = ts.automated_readability_index(
                doc)
            #scores['linsear'] = ts.linsear_write_formula(doc)
            #scores['difficult_words'] = ts.difficult_words(doc)
            scores['dale_chall'] = ts.dale_chall_readability_score(doc)
            #scores['gunning_fog'] = ts.gunning_fog(doc)
            scores['lix'] = ts.lix(doc)
            docs_scores.append(scores)

        return docs_scores
Example #7
def main():
    csv_file2 = open(sys.argv[2], 'w', encoding="utf8")
    writer = csv.writer(csv_file2, delimiter=',')
    doc_id = 1
    writer.writerow(["ID", "URL", "text", "impact-score", "readability", "grade-level", "smog-index", "total-words", "total-sentences"])
    with open(sys.argv[1], 'r',  encoding="utf8", errors='ignore') as csv_file1:
        reader = csv.reader(csv_file1)
        # Skip the first line with headers
        next(reader)
        for row in reader:
            impact = str(row[0])
            url = str(row[1])
            text = str(row[2])
            read_ease = textstat.flesch_reading_ease(text)
            grade = textstat.flesch_kincaid_grade(text)
            smog = textstat.smog_index(text)
            words = textstat.lexicon_count(text)
            sentences = textstat.sentence_count(text)
            # Uncomment this if we want summary and key words
            # summary = summarize(text, ratio=0.3)
            # key_words = keywords(text, ratio=0.3)

            writer.writerow([doc_id]+[url]+[text]+[impact]+[read_ease]+[grade]+[smog]+[words]+[sentences])
            doc_id = doc_id+1
    csv_file1.close()
    csv_file2.close()

    print('Summary statistics complete!')
Example #8
def get_special_metrics(text):
    blob = TextBlob(text)
    main = {
        "statistics": {
            "syllables": textstat.syllable_count(text),
            "words": textstat.lexicon_count(text),
            "characters": textstat.char_count(text),
            "polysyllables": textstat.polysyllabcount(text),
            "average letter per word": textstat.avg_letter_per_word(text),
            "average sentence length": textstat.avg_sentence_length(text),
            "average sentence per word": textstat.avg_sentence_per_word(text),
            "sentences": textstat.sentence_count(text),
        },
        "difficulty": {
            "flesch reading ease": textstat.flesch_reading_ease(text),
            "smog index": textstat.smog_index(text),
            "flesch kincaid grade": textstat.flesch_kincaid_grade(text),
            "coleman liau index": textstat.coleman_liau_index(text),
            #'automated readability index': textstat.automated_readability_index(text),
            #'dale chall readability score': textstat.dale_chall_readability_score(text),
            #'difficult words': textstat.difficult_words(text),
            #'linsear write formula': textstat.linsear_write_formula(text),
            "gunning fog": textstat.gunning_fog(text),
        },
        "sentiments": {"polarity": blob.sentiment.polarity, "subjectivity": blob.sentiment.subjectivity},
    }

    return main
Example #9
    def readability_analysis(self, text):
        words = text.split()
        wrd_dic = {}
        for wrd in words:
            wrd = "".join(a for a in wrd if a not in punctuation)
            wrd_dic[wrd] = textstat.syllable_count(wrd)
        wrd_dic = [b for b in wrd_dic if wrd_dic[b] >= 5]

        flesch_reading_ease = textstat.flesch_reading_ease(text)

        if flesch_reading_ease > 100:
            flesch_reading_ease = 100
        elif flesch_reading_ease < 0:
            flesch_reading_ease = 0

        syllable_count = textstat.syllable_count(text)
        avg_syllables_per_word = textstat.avg_syllables_per_word(text)
        avg_letter_per_word = textstat.avg_letter_per_word(text)

        readability = {
            "flesch_reading_ease": flesch_reading_ease,
            "avg_syllables_per_word": avg_syllables_per_word,
            "syllable_count": syllable_count,
            "avg_letter_per_word": avg_letter_per_word,
        }

        grade, score = self.readability_grade(readability)
        readability['grade'] = grade
        readability['score'] = score
        readability['difficult_words'] = wrd_dic
        return readability
Example #10
def _get_reading_stats(no_code_text):
    """
    Returns reading level information
    :param no_code_text: String to analyse
    :return: list of details
    """
    group_by = 'Reading Level Analysis '
    results = []
    results.append(TextFeature('Flesch Reading Ease', textstat.flesch_reading_ease(no_code_text), group_by))        # higher is better, scale 0 to 100
    results.append(TextFeature('Flesch-Kincaid Grade Level', textstat.flesch_kincaid_grade(no_code_text), group_by))
    try:
        results.append(TextFeature('The Fog Scale (Gunning FOG formula)', textstat.gunning_fog(no_code_text), group_by))
    except IndexError:  # Not sure why, but this test throws this error sometimes
        results.append(TextFeature('The Fog Scale (Gunning FOG formula)', "Undetermined", group_by))
    try:
        results.append(TextFeature('The SMOG Index', textstat.smog_index(no_code_text), group_by))
    except IndexError:  # Not sure why, but this test throws this error sometimes
        results.append(TextFeature('The SMOG Index', "Undetermined", group_by))
    results.append(TextFeature('Automated Readability Index', textstat.automated_readability_index(no_code_text), group_by))
    results.append(TextFeature('The Coleman-Liau Index', textstat.coleman_liau_index(no_code_text), group_by))
    try:
        results.append(TextFeature('Linsear Write Formula', textstat.linsear_write_formula(no_code_text), group_by))
    except IndexError:
        results.append(TextFeature('Linsear Write Formula', "Undetermined", group_by))
    try:
        results.append(TextFeature('Dale Chall Readability Score', textstat.dale_chall_readability_score(no_code_text), group_by))
    except IndexError:  # Not sure why, but this test throws this error sometimes
        results.append(TextFeature('Dale Chall Readability Score', "Undetermined", group_by))

    try:
        results.append(TextFeature('Readability Consensus', textstat.readability_consensus(no_code_text), group_by))
    except (TypeError, IndexError):
        results.append(TextFeature('Readability Consensus', "Undetermined; One of the tests above failed.", group_by))
    return results
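
`TextFeature` is defined elsewhere in that project; a plausible stand-in, purely an assumption for illustration, is a simple named tuple holding a label, a value, and the group heading:

from collections import namedtuple

# Hypothetical stand-in for the project's TextFeature type
TextFeature = namedtuple('TextFeature', ['name', 'value', 'group_by'])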
Example #11
def readability(text):
    print("Readability\n=================================\n\n")
    print("Flesch Reading Ease\n________________________\n\n")
    print str(textstat.flesch_reading_ease(text)) + "\n"
    print("Smog Index\n________________________\n\n")
    print str(textstat.smog_index(text)) + "\n"
    print("Flesch Kincaid Grade\n________________________\n\n")
    print str(textstat.flesch_kincaid_grade(text)) + "\n"
    print("Coleman Liau Index\n________________________\n\n")
    print str(textstat.coleman_liau_index(text)) + "\n"
    print("ARI\n________________________\n\n")
    print str(textstat.automated_readability_index(text)) + "\n"
    print("Dale Chall\n________________________\n\n")
    print str(textstat.dale_chall_readability_score(text)) + "\n"
    print("Difficult Words\n________________________\n\n")
    print str(textstat.difficult_words(text)) + "\n"
    print("Linsear Write Formula\n________________________\n\n")
    print str(textstat.linsear_write_formula(text)) + "\n"
    print("Gunning Fog\n________________________\n\n")
    print str(textstat.gunning_fog(text)) + "\n"
    print "Compiled Score\n_____________________________\n\n"
    print str(textstat.text_standard(text)) + "\n"


    return len(adjectives)
Example #12
def main():
    """
    Evaluate and print Readability scores
    """

    if len(sys.argv) > 1:
        inf = open(sys.argv[1], 'r')
    else:
        sys.stderr.write('Error: specify input file.\n')
        sys.exit()

    text = inf.read()
    inf.close()

    lexcount = textstat.lexicon_count(text)

    sys.stdout.write('Lexicon count: {0:d}\n'.format(lexcount))
    
    # reading time in minutes
    # assumes 180 WPM plus some offset
    tread = (lexcount + 250) / 180.

    sys.stdout.write('Estimating reading time: {0:1.1f} minutes.\n'.format(tread))

    ease = textstat.flesch_reading_ease(text)
    grade = textstat.flesch_kincaid_grade(text)

    sys.stdout.write('Flesch reading ease score: {0:1.1f}\n'.format(ease))
    sys.stdout.write('Flesch-Kincaid grade: {0:1.1f}\n'.format(grade))
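
For reference, this estimate gives a 1,550-word text (1550 + 250) / 180 ≈ 10 minutes of reading time.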
Example #13
File: ReadSpeed.py  Project: d4444x/readr
def text_analytics(text):
    if textstat.sentence_count(text) != 0:
        lexicon = textstat.lexicon_count(text) #word count
        sent = textstat.sentence_count(text) #sentence count
        syll = textstat.syllable_count(text) #syllable count
        flesch = textstat.flesch_reading_ease(text) #flesch score
        smog = textstat.smog_index(text) #SMOG index
        fog = textstat.gunning_fog(text) #FOG index
        dale = textstat.dale_chall_readability_score(text) #grade level
        ari = textstat.automated_readability_index(text) #grade level
        cl = textstat.coleman_liau_index(text) #grade level

        flesch1 = lexicon*flesch
        flesch2 = sent*flesch
        flesch3 = syll*flesch
        smog1 = lexicon*smog
        smog2 = sent*smog
        smog3 = syll*smog
        fog1 = lexicon*fog
        fog2 = sent*fog
        fog3 = syll*fog
        dale1 = lexicon*dale
        dale2 = sent*dale
        dale3=syll*dale
        ari1 = lexicon*ari
        ari2 = sent*ari
        ari3 = syll*ari
        cl1 = lexicon*cl
        cl2 = sent*cl
        cl3 = syll*cl
        x = [lexicon, sent, syll, flesch, smog, fog, dale, ari, cl,
             flesch1, flesch2, flesch3, smog1, smog2, smog3,
             fog1, fog2, fog3, dale1, dale2, dale3, ari1, ari2, ari3,
             cl1, cl2, cl3]
        return x
    return []  # no sentences detected: nothing to score
Example #14
def save_page(request):
    #Smash through the request and get the url, desc & folder
    url = request.GET.get('url')
    desc = request.GET.get('desc')
    title = request.GET.get('title')
    folder_name = request.GET.get('folder_name')

    #Score the page using APIs

    r = requests.get(url)  #Open the url, read the contents then score them. Seems to be slowing down the app quite a bit.
    myfile = BeautifulSoup(r.text,"html.parser").text
    blob = TextBlob(myfile)
    scr_polarity = "{:.2f}".format( blob.sentiment.polarity)
    scr_subjectivity = "{:.2f}".format( blob.sentiment.subjectivity)
    scr_readability ="{:.2f}".format(textstat.flesch_reading_ease(myfile))

    #Save page
    user_profile = UserProfile.objects.all().get(user=request.user)
    folder = Folder.objects.all().get(name=folder_name,user=user_profile)
    P = Page.objects.get_or_create(title=title,
                                   url=url,
                                   summary=desc,
                                   readability_score=scr_readability,
                                   objectivity_score=scr_subjectivity,
                                   sentimentality_score=scr_polarity,
                                   folder=folder)

    #Return success/fail
    return HttpResponse('Was a success')
Example #15
def get_special_metrics(text):
    blob = TextBlob(text)
    main = {
        'statistics': {
            'syllables': textstat.syllable_count(text),
            'words': textstat.lexicon_count(text),
            'characters': textstat.char_count(text),
            'polysyllables': textstat.polysyllabcount(text),
            'average letter per word': textstat.avg_letter_per_word(text),
            'average sentence length': textstat.avg_sentence_length(text),
            'average sentence per word': textstat.avg_sentence_per_word(text),
            'sentences': textstat.sentence_count(text)
        },
        'difficulty': {
            'flesch reading ease': textstat.flesch_reading_ease(text),
            'smog index': textstat.smog_index(text),
            'flesch kincaid grade': textstat.flesch_kincaid_grade(text),
            'coleman liau index': textstat.coleman_liau_index(text),
            #'automated readability index': textstat.automated_readability_index(text),
            #'dale chall readability score': textstat.dale_chall_readability_score(text),
            #'difficult words': textstat.difficult_words(text),
            #'linsear write formula': textstat.linsear_write_formula(text),
            'gunning fog': textstat.gunning_fog(text)
        },
        'sentiments': {
            'polarity': blob.sentiment.polarity,
            'subjectivity': blob.sentiment.subjectivity
        }
    }

    return main
Example #16
def getFeatures(files):
    allFileData = []
    allScores = []
    allFres = []
    df = files
    rowNum = df.shape[0]
    for i in range(rowNum):
        FileData = []
        review = (df.iloc[i].reviewText).strip().split()
        summary = (df.iloc[i].summary).strip().split()
        row_data = np.append(review, summary)

        score = float(df.iloc[i].overall)
        fres = textstat.flesch_reading_ease(str(row_data))
        lor = lengthOfReview(str(row_data))
        sc = sentenceCount(str(row_data))
        cc = charCount(str(row_data))
        acc = allCapCount(str(row_data))
        qc = questionCount(str(row_data))
        review = [element.lower() for element in review]

        FileData.append(lor)
        FileData.append(sc)
        FileData.append(cc)
        FileData.append(acc)
        FileData.append(qc)
        FileData.append(score)
        allFileData.append(FileData)
        allScores.append(score)
        allFres.append(fres)

    return allFileData, allScores, allFres
Example #17
File: app.py  Project: smanjit/metrics
def compareContents():
	if request.method == "POST":
	    line = request.form['poem']
	    poem1 = request.form['poem1']
		#---------Metrics comparison logic goes here. keep them in session attributes-----------------------#

	    session['line'] = line	    
        #print("i am in row : ",row)
        #print "Tagline :", line
	    #print("no of words= ",len(line.split()))
	    #line1 = line.lstrip('0123456789.- ,')
	    #print "flesch_reading_ease = ",textstat.flesch_reading_ease(line)
	    fre = textstat.flesch_reading_ease(line)
	    session['fre'] = fre
	    #print "smog_index = ",textstat.smog_index(line)
	    smog = textstat.smog_index(line)
	    session['smog'] = smog
	    #print "flesch_kincaid_grade = ",textstat.flesch_kincaid_grade(line)
	    fkg = textstat.flesch_kincaid_grade(line)
	    session['fkg'] = fkg
	    #print "dale_chall_readability_score = ", textstat.dale_chall_readability_score(line)
	    dcr = textstat.dale_chall_readability_score(line)
	    session['dcr'] = dcr
	    #print "gunning_fog = ",textstat.gunning_fog(line)
	    gf = textstat.gunning_fog(line)
	    session['gf'] = gf
	    metrics = True
	    return render_template('compareContents.html',metrics=metrics, line=line, fre=fre, smog=smog, fkg=fkg, dcr=dcr,gf=gf)
	return render_template('compareContents.html')
Example #18
def other_features(tweet):
    ##SENTIMENT
    sentiment = VS(tweet)
    ##READABILITY
    #See https://pypi.python.org/pypi/textstat/
    flesch = round(textstat.flesch_reading_ease(tweet), 3)
    flesch_kincaid = round(textstat.flesch_kincaid_grade(tweet), 3)
    gunning_fog = round(textstat.gunning_fog(tweet), 3)
    ##TEXT-BASED
    length = len(tweet)
    num_terms = len(tweet.split())
    ##TWITTER SPECIFIC TEXT FEATURES
    hashtag_count = tweet.count("#")
    mention_count = tweet.count("@")
    url_count = tweet.count("http")
    retweet = 0
    if tweet.lower().startswith("rt") is True:
        retweet = 1
    #Checking if RT is in the tweet
    words = tweet.lower().split()
    if "rt" in words or "#rt" in words:
        retweet = 1
    features = [
        sentiment['compound'], flesch, flesch_kincaid, gunning_fog, length,
        num_terms, hashtag_count, mention_count, url_count, retweet
    ]
    return features
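
`VS` is not defined in this excerpt; it presumably wraps the VADER sentiment analyzer so that `VS(tweet)` returns the usual score dictionary with a 'compound' key. A sketch of that assumption:

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

_analyzer = SentimentIntensityAnalyzer()

def VS(text):
    # returns {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}
    return _analyzer.polarity_scores(text)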
Example #19
def getTopicsDistributionWithinTheText(path, chunk_length=300):

    global_scores = topicsFromTokens(tokenize(loadText(path)))
    text = loadText(path)
    words = text.split()
    average = textstat.flesch_reading_ease(text)
    scores = dict()
    for i in sorted(global_scores, key=lambda tup: tup[1], reverse=True):
        if i[1] > min_score:
            scores[i[0]] = []
    x = []
    y = []
    cur = 0
    i = 1
    while (cur < len(words)):
        sub = words[cur:cur + chunk_length]
        sub.append('.')
        sub_text = ' '.join(sub)
        cur += chunk_length

        bow = lda.id2word.doc2bow(raw_tokenize(sub_text))
        score = lda.get_document_topics(bow)
        for s in score:
            if s[0] in scores.keys():
                scores[s[0]].append(s[1])

        for s in scores:
            if len(scores[s]) < i:
                scores[s].append(0)
        i += 1

    return scores, global_scores
Example #20
def other_features(tweet):
    ##SENTIMENT
    sentiment = VS(tweet)
    ##READABILITY
    #See https://pypi.python.org/pypi/textstat/
    flesch = round(textstat.flesch_reading_ease(tweet),3)
    flesch_kincaid = round(textstat.flesch_kincaid_grade(tweet),3)
    gunning_fog = round(textstat.gunning_fog(tweet),3)
    ##TEXT-BASED
    length = len(tweet)
    num_terms = len(tweet.split())
    ##TWITTER SPECIFIC TEXT FEATURES
    hashtag_count = tweet.count("#")
    mention_count = tweet.count("@")
    url_count = tweet.count("http")
    retweet = 0
    if tweet.lower().startswith("rt") is True:
        retweet = 1
    #Checking if RT is in the tweet
    words = tweet.lower().split()
    if "rt" in words or "#rt" in words:
        retweet = 1
    features = [sentiment['compound'],flesch, flesch_kincaid,
                gunning_fog, length, num_terms,
                hashtag_count, mention_count,
                url_count, retweet]
    return features
Example #21
def f():
    print("hello")
    book = xlwt.Workbook()
    worksheet = book.add_sheet('ReadabilityScore')
    worksheet.write(0, 0, "Gen_sent")
    worksheet.write(0, 1, "flesch_reading_ease")
    worksheet.write(0, 2, "flesch_kincaid_grade")
    worksheet.write(0, 3, "dale_chall_readability_score")
    worksheet.write(0, 4, "gunning_fog")

    f = open('abc.txt')  #, encoding='utf-8')
    row = 1
    for line in iter(f):
        #print("i am in row : ",row)
        #print "Tagline :", line
        worksheet.write(row, 0, line)
        #print("no of words= ",len(line.split()))
        #line1 = line.lstrip('0123456789.- ,')
        #print "flesch_reading_ease = ",textstat.flesch_reading_ease(line)
        fre = textstat.flesch_reading_ease(line)
        worksheet.write(row, 1, fre)
        #print "smog_index = ",textstat.smog_index(line)
        smog = textstat.smog_index(line)
        #print "flesch_kincaid_grade = ",textstat.flesch_kincaid_grade(line)
        fkg = textstat.flesch_kincaid_grade(line)
        worksheet.write(row, 2, fkg)
        #print "dale_chall_readability_score = ", textstat.dale_chall_readability_score(line)
        dcr = textstat.dale_chall_readability_score(line)
        worksheet.write(row, 3, dcr)
        #print "gunning_fog = ",textstat.gunning_fog(line)
        gf = textstat.gunning_fog(line)
        worksheet.write(row, 4, gf)
        row += 1
    book.save('Readability_Scores.xls')
Example #22
def process(url):
    html = urllib.urlopen(url).read()
    soup = BeautifulSoup(html, "html.parser")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    # get text
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)

    blob = TextBlob(text)
    sent = blob.sentiment
    subj = 100 - int((sent.subjectivity)*100)                     #the less subjective, the better
    polar = 100 - int((sent.polarity)*100)                        #the less polar, the better
    readability = int(textstat.flesch_reading_ease(text))

    return subj, polar, readability
Example #23
 def do_text_stats(self, text):
     ### Syllable Count
     syllable_count = textstat.syllable_count(text)
     ### Lexicon Count
     lexicon_count = textstat.lexicon_count(text, True)
     ### Sentence Count
     sentence_count = textstat.sentence_count(text)
     ### The Flesch Reading Ease formula
     try:
         flesch_reading_ease = textstat.flesch_reading_ease(text)
     except TypeError as e:
         flesch_reading_ease = None
     #* 90-100 : Very Easy
     #* 80-89 : Easy
     #* 70-79 : Fairly Easy
     #* 60-69 : Standard
     #* 50-59 : Fairly Difficult
     #* 30-49 : Difficult
     #* 0-29 : Very Confusing
     ### The Flesch-Kincaid Grade Level
     try:
         flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
     except TypeError as e:
         flesch_kincaid_grade = None
     ## The Fog Scale (Gunning FOG Formula)
     gunning_fog = textstat.gunning_fog(text)
     ### The SMOG Index
     smog_index = textstat.smog_index(text)
     ### Automated Readability Index
     automated_readability_index = textstat.automated_readability_index(
         text)
     ### The Coleman-Liau Index
     try:
         coleman_liau_index = textstat.coleman_liau_index(text)
     except TypeError as e:
         coleman_liau_index = None
     ### Linsear Write Formula
     linsear_write_formula = textstat.linsear_write_formula(text)
     ### Dale-Chall Readability Score
     dale_chall_readability_score = textstat.dale_chall_readability_score(
         text)
     ### Readability Consensus based upon all the above tests
     try:
         text_standard = textstat.text_standard(text)
     except TypeError as e:
         text_standard = None
     return {
         "syllable_count": syllable_count,
         "lexicon_count": lexicon_count,
         "sentence_count": sentence_count,
         "flesch_reading_ease": flesch_reading_ease,
         "flesch_kincaid_grade": flesch_kincaid_grade,
         "gunning_fog": gunning_fog,
         "smog_index": smog_index,
         "automated_readability_index": automated_readability_index,
         "coleman_liau_index": coleman_liau_index,
         "linsear_write_formula": linsear_write_formula,
         "dale_chall_readability_score": dale_chall_readability_score,
         "text_standard": text_standard
     }
Example #24
def get_readability(df2):
    df = df2.copy()
    text_feats = df.select_dtypes(include=['object']).columns.values
    for i, col in enumerate(text_feats):
        df['flesch_reading_ease{}'.format(i)] = df[col].apply(
            lambda x: textstat.flesch_reading_ease(x))
        df['smog_index{}'.format(i)] = df[col].apply(
            lambda x: textstat.smog_index(x))
        df['flesch_kincaid_grade{}'.format(i)] = df[col].apply(
            lambda x: textstat.flesch_kincaid_grade(x))
        df['coleman_liau_index{}'.format(i)] = df[col].apply(
            lambda x: textstat.coleman_liau_index(x))
        df['automated_readability_index{}'.format(i)] = df[col].apply(
            lambda x: textstat.automated_readability_index(x))
        df['dale_chall_readability_score{}'.format(i)] = df[col].apply(
            lambda x: textstat.dale_chall_readability_score(x))
        df['difficult_words{}'.format(i)] = df[col].apply(
            lambda x: textstat.difficult_words(x))
        df['linsear_write_formula{}'.format(i)] = df[col].apply(
            lambda x: textstat.linsear_write_formula(x))
        df['gunning_fog{}'.format(i)] = df[col].apply(
            lambda x: textstat.gunning_fog(x))
        df['text_standard{}'.format(i)] = df[col].apply(
            lambda x: textstat.text_standard(x))
    return df
Example #25
def flesch_reading_ease_score():
    tokenizer = RegexpTokenizer(r'\w+')
    final_flesch_reading_ease_score = []
    for index, row in df.iterrows():
        valid_words = []
        body_only = re.sub('<code>[^>]+</code>', '', row['Body'])
        soup = BeautifulSoup(body_only, "lxml")
        word_tokens = tokenizer.tokenize(soup.text)
        for word in word_tokens:
            if not_punctuation(word):
                valid_words.append(word)
        word_count = len(valid_words)
        tag_removed_text = soup.text
        tag_removed_text = tag_removed_text.replace("\n", "")

        if word_count != 0:
            flesch_reading_ease_score = textstat.flesch_reading_ease(
                tag_removed_text)
        else:
            flesch_reading_ease_score = 0
        print "flesch_reading_ease_score of ", index, " - ", flesch_reading_ease_score
        final_flesch_reading_ease_score.append(flesch_reading_ease_score)

    df['BodyFleschReadingEaseLevel'] = final_flesch_reading_ease_score
    df.to_csv("combined.csv")
Example #26
def flesch_score(text):
    try:
        if text == "":
            return 0
        return textstat.flesch_reading_ease(text)
    except:
        return 0
Example #27
def print_readability_metrics(text, file_name):
    print(file_name, " readability metrics")
    print("flesch reading ease: ", textstat.flesch_reading_ease(text))
    print("dale chall readability: ",
          textstat.dale_chall_readability_score(text))
    print("smog index: ", textstat.smog_index(text))
    print('------------------------------------------------')
Example #28
def get_text_features(article_contents: str) -> dict:
    """
    Takes an article's contents and analyzes its complexity using numerous reading scores and methods. Also calculates
    other factors such as the number of typos.

    @param article_contents, a string which contains the contents of an article
    @return language_analysis_dict, a dictionary which contains the readability scores and the typo and difficult-word ratios computed below
    """
    tool = language_check.LanguageTool('en-US')
    language_analysis_dict = {
        "flesch_reading":
        textstat.flesch_reading_ease(article_contents),
        "flesch_kincaid":
        textstat.flesch_kincaid_grade(article_contents),
        "coleman_liau":
        textstat.coleman_liau_index(article_contents),
        "typos_to_words":
        len(tool.check(article_contents)) /
        textstat.lexicon_count(article_contents),
        "percent_difficult_words":
        textstat.difficult_words(article_contents) /
        textstat.lexicon_count(article_contents),
    }

    return language_analysis_dict
Example #29
def calculate_statistics(lyrics):
    """
    Calculates statistics based on the text_raw of the lyrics.
    :return: Annotated lyrics containing information about the songs
    """
    logging.info("Calculating Statistics")
    from textstat.textstat import textstat
    for idx, song in tqdm(enumerate(lyrics), total=len(lyrics)):
        try:
            song["num_syllables"] = textstat.syllable_count(song["text_raw"])
            song["num_words"] = textstat.lexicon_count(song["text_raw"])
            song["num_sentences"] = textstat.sentence_count(song["text_raw"])
            song["flesch_score"] = textstat.flesch_reading_ease(
                song["text_raw"])
            song["flesch_kincaid_level"] = textstat.flesch_kincaid_grade(
                song["text_raw"])
            song["fog_score"] = textstat.gunning_fog(song["text_raw"])
            song[
                "num_difficult_words"] = textstat.dale_chall_readability_score(
                    song["text_raw"])
        except Exception as e:
            logging.error(
                "Something bad happened in the current song ! Skipping it... \n{}"
                .format(song))
            logging.exception(e)
    return lyrics
Example #30
def getTopicsDistributionWithinTheText(path, chunk_length = 300 ):
    
	global_scores = topicsFromTokens(tokenize(loadText(path)))
	text = loadText(path)
	words = text.split()
	average = textstat.flesch_reading_ease(text)
	scores = dict()
	for i in sorted(global_scores, key=lambda tup: tup[1], reverse = True):
	    if i[1] > min_score:
	        scores[i[0]] = []
	x = []
	y = []
	cur = 0
	i = 1
	while ( cur < len(words) ):
	    sub = words[cur:cur+chunk_length]
	    sub.append('.')
	    sub_text = ' '.join(sub)
	    cur += chunk_length
	    
	    bow = lda.id2word.doc2bow(raw_tokenize(sub_text))
	    score = lda.get_document_topics(bow)
	    for s in score:
	        if s[0] in scores.keys():
	            scores[s[0]].append(s[1])
	            
	    for s in scores:
	        if len(scores[s]) < i:
	            scores[s].append(0)
	    i += 1
	    
	    
	return scores, global_scores
Example #31
def analyse_plain_text(test_data):
    text_stats = TextStats()

    # Do some simple analysis.
    from textblob import TextBlob
    zen = TextBlob(test_data)
    text_stats.word_count = len(zen.words)
    text_stats.sentence_count = len(zen.sentences)
    text_stats.polarity = zen.sentiment.polarity
    text_stats.subjectivity = zen.sentiment.subjectivity

    # Easy to read, this?
    from textstat.textstat import textstat
    text_stats.flesch_reading_ease = textstat.flesch_reading_ease(test_data)

    # Words per sentence count.
    from textstat.textstat import textstat
    text_stats.word_per_sentence_count = (
        textstat.lexicon_count(test_data, False) /
        textstat.sentence_count(test_data))

    # Convert all to lower.
    test_data = test_data.lower()

    # Tokenise.
    from nltk.tokenize import word_tokenize
    words = word_tokenize(test_data)

    # Tokenise stemmed text.
    from nltk.stem import PorterStemmer
    ps = PorterStemmer()
    test_data_stemmed = ''
    for w in words:
        test_data_stemmed = test_data_stemmed + ' ' + ps.stem(w)
    stemmed_words = word_tokenize(test_data_stemmed)

    # Remove non-words.
    nonPunct = re.compile('.*[A-Za-z0-9].*')  # must contain a letter or digit
    filtered = [w for w in stemmed_words if nonPunct.match(w)]

    # Remove stopwords:
    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('english'))
    extra_stopwords = set([
        'that', '\'s', 'wa', 'thi', 'like', 'n\'t', 'would', 'ha', 'us', 'get'
    ])
    filtered = [
        w for w in filtered if w not in stopwords and w not in extra_stopwords
    ]

    # How many unique words?
    from collections import Counter
    counts = Counter(filtered)
    text_stats.unique_word_count = len(counts)

    # Words sorted by most common.
    text_stats.counts = counts

    return text_stats
Example #32
def calculateScores(text):
    text = smart_bytes(text,encoding="utf-8",strings_only=False,errors="replace")
    temp = TextBlob(text)
    toReturn = {}
    toReturn["readability_score"] = textstat.flesch_reading_ease(text)
    toReturn["subjectivity_score"] = temp.sentiment.subjectivity * 100
    toReturn["sentiment_score"] = (temp.sentiment.polarity + 1) * 50
    return toReturn
Example #33
def flesch_reading_ease(text):
    """
    :type text: Text
    :param text: The text to be analysed
    :rtype: float
    :returns: Flesch Reading Ease score
    """
    return textstat.flesch_reading_ease(text.text)
Example #34
def get_sensitivity_rating(text):
    blob = TextBlob(text)
    d = {"sentimentality": blob.subjectivity * 100,
         "readability": textstat.flesch_reading_ease(text)}
    # subjectivity is a float between 0 and 1, multiplied by 100 so it's a percent
    # The overall sensitivity of the article is the readability minus the sentimentality
    d["sensitivity"] = (d["readability"] - d["sentimentality"])
    return d
Example #35
def textConfidence(fname):
    with PyTessBaseAPI() as api:
        #for image in images:
        api.SetImageFile(fname)
        text = api.GetUTF8Text()
        #print api.AllWordConfidences()
        print textstat.flesch_kincaid_grade(text)

        print textstat.flesch_reading_ease(text)

        print("90-100 : Very Easy")
        print("80-89 : Easy")
        print("70-79 : Fairly Easy")
        print("60-69 : Standard")
        print("50-59 : Fairly Difficult")
        print("30-49 : Difficult")
        print("0-29 : Very Confusing")
Example #36
 def get_readme_flesch_reading_ease(self):
     """Calculates the Flesch Reading Ease level of the
     repository's readme, with 100 being easiest to
     read and 0 being hardest, using textstat.
     """
     if self.readme:
         return textstat.flesch_reading_ease(self.readme)
     else:
         return None
Example #37
 def get_reading_score(self):
     """
     Takes the page copy and returns the Flesch reading ease of the page.
     Also checks that the page text is long enough (more than 10 characters), since textstat raises an error on very short input.
     """
     if len(self.page_text) > 10:
         self.reading_score = textstat.flesch_reading_ease(self.page_text)
     else:
         self.reading_score = 0
Example #38
 def get_readme_flesch_reading_ease(self):
     """Calculates the Flesch Reading Ease level of the
     repository's readme, with 100 being easiest to
     read and 0 being hardest, using textstat.
     """
     if self.readme:
         return textstat.flesch_reading_ease(self.readme)
     else:
         return None
Example #39
def _get_readability_score(text):
    try:
        readability_score = int(textstat.flesch_reading_ease(text))
        # converts (-100, 100) to (0, 100)
        readability_score = 50 + readability_score / 2
        return readability_score
    except:
        print '_get_readability_score'
        return 0
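
With this mapping, for example, a raw Flesch score of 100 stays at 100, a score of 0 maps to 50, and a score of -40 maps to 30.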
Example #40
def trygetreadingease(extract):
    # Avoid spamming warnings for the common case of zero-length extracts:
    if len(extract.strip()) == 0:
        return None
    try: 
        return textstat.flesch_reading_ease(extract)
    except Exception as e:
        logging.warning("Can't get FRES for %s: %s" % (extract, e))
        return None
Example #41
def run_query(search_terms):
    # Construct the latter part of our request's URL.
    # Sets the format of the response to JSON and sets other properties.
    search_url = "http://healthfinder.gov/developer/Search.json?api_key="+ HEALTHFINDER_API_KEY + "&keyword="+search_terms

    # Create our results list which we'll populate.
    results = []

    try:
        # Connect to the server and read the response generated.
        response = urllib2.urlopen(search_url).read()
        
        # Convert the string response to a Python dictionary object.
        json_response = json.loads(response)
        if 'Tools' not in json_response["Result"]:
            return []
        # Loop through each page returned, populating out results list.
        for result in json_response["Result"]["Tools"]:
            try:
              blob = TextBlob(result['Contents'])
              for sentence in blob.sentences:
                  polarity_score = sentence.sentiment.polarity
                  subjectivity_score = sentence.sentiment.subjectivity
    
              url = ""
              
              if type(result['MoreInfo'])== list:
                  url = result['MoreInfo'][0]['Url']
              else:
                  url = result['MoreInfo']['Url']

              if url.endswith('/'):
                  url = url[:-1]

              adder = ""
              if len(result['Contents']) > 400:
                adder = "..."

              results.append({
                'title': result['Title'],
                'link': url,
                'summary': result['Contents'][:400] + adder,
                'flesch_score': '{0:.2f}'.format(textstat.flesch_reading_ease(result['Contents'])),
                'polarity_score': '{0:.2f}'.format(polarity_score),
                'subjectivity_score': '{0:.2f}'.format(subjectivity_score),
                'source':'HealthFinder'})
            except:
                continue


    # Catch a URLError exception - something went wrong when connecting!
    except urllib2.URLError as e:
        print "Error when querying the HealthFinder API: ", e

    # Return the list of results to the calling function.
    return results
Example #42
def vecify(v):
    return [ts.flesch_reading_ease(v),
    # ts.smog_index(v),
    ts.flesch_kincaid_grade(v),
    ts.coleman_liau_index(v),
    ts.automated_readability_index(v),
    ts.dale_chall_readability_score(v),
    ts.difficult_words(v),
    ts.linsear_write_formula(v),
    ts.gunning_fog(v)]
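
A usage sketch (the `ts` alias and the sample text are assumptions) showing `vecify` turning a document into a vector of readability features:

from textstat.textstat import textstat as ts  # the alias the snippet above appears to use

sample = ("Readability formulas such as Flesch, Coleman-Liau and Gunning Fog estimate "
          "how difficult a passage of English text is for a typical reader to understand.")
features = vecify(sample)  # eight readability scores, in the order listed above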
Example #43
 def calc_flesch_level(self):
         #206.835 - (total words/total sentences) * 1.015 - (total syllables / total words) * 84.6
         #print(sum([len(word)/3. if word not in arpabet else len(arpabet[word][0]) for word in self.word_tok[0] if word not in punctuation_set]))
         #print(self.sent_len[0], self.word_count[0])
         #input()
         #self.flesch_level = [sl * 0.39 + 11.8 * np.mean(
         #    [len(word)/3. if word not in arpabet else len(arpabet[word][0])
         #     for word in text if word not in punctuation_set]) - 15.59
         #                     for text, sl, wc in zip(self.word_tok, self.sent_len, self.word_count)]
         self.flesch_level = [textstat.flesch_reading_ease(' '.join(text)) for text in self.sent_tok]
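
The commented-out line above is the classic Flesch Reading Ease formula. For comparison, a small helper that evaluates it directly from pre-counted totals (a sketch, not part of the original class) could look like this:

def flesch_reading_ease_manual(total_words, total_sentences, total_syllables):
    # 206.835 - 1.015 * (words per sentence) - 84.6 * (syllables per word)
    return (206.835
            - 1.015 * (float(total_words) / total_sentences)
            - 84.6 * (float(total_syllables) / total_words))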
Example #44
def fleschScore(data):

    # textstat only works on sentences or strings longer than about 3 words;
    # shorter input raises a TypeError. The documents here are large, so this is
    # rare, but for safety we catch the error and return 0.0.
    try:
        readingScore = textstat.flesch_reading_ease(data)
    except TypeError:
        readingScore = 0.0
    return readingScore
Example #45
def calculate_stats(content):

    try:
        testimonial = TextBlob(content)
        polarity = '%.3f'%(testimonial.sentiment.polarity)
        subjectivity = '%.3f'%(testimonial.sentiment.subjectivity)

        flesh_score = '%.3f'%(textstat.flesch_reading_ease(content))

        return {'polarity': polarity, 'subjectivity': subjectivity, 'flesh_score': flesh_score}
    except:
        return {'polarity': 0, 'subjectivity': 0, 'flesh_score': 0}
Example #46
def textstat_analysis(profile_text):
    fre = textstat.flesch_reading_ease(profile_text)
    smog = textstat.smog_index(profile_text)
    fkg = textstat.flesch_kincaid_grade(profile_text)
    coleman = textstat.coleman_liau_index(profile_text)
    ari = textstat.automated_readability_index(profile_text)
    dale = textstat.dale_chall_readability_score(profile_text)
    dw = textstat.difficult_words(profile_text)
    lwf = textstat.linsear_write_formula(profile_text)
    gf = textstat.gunning_fog(profile_text)
    rc = textstat.readability_consensus(profile_text)
    word_count = textstat.lexicon_count(profile_text)
    return (fre, smog, fkg, coleman, ari, dale, dw, lwf, gf, rc, word_count)
Example #47
def displayResults( path ):
	print "stats"
	text = loadText(path)
	raw_tokens = raw_tokenize(text)
	print "number of words %s" %count_words(text)
	print "number of sentences %s" %textstat.sentence_count(text)
	print "uniques words: %s" %len(set(raw_tokenize(text)))
	print "Difficulty %s / 100 " %(100 - textstat.flesch_reading_ease(text))
	print "Average sentiment %s (negative: 0, neutral: 5, positive: 10)"%calculateSentiment(raw_tokens)
	print
	print "topic distribution"
	displayTopicsDistributionWithinTheText(path, 300, pie = False)
	print "difficulty over the text "
	complexityAlongtheText( path, 300)
Example #48
def age_feature(text, feature_vect):
    """
    Extract age features
    :param text:
    :param feature_vect: contains a bag of words
    :return:a dictionary which contains the feature and its computed value
    """
    tokens = word_tokenize(text.lower())

    features = {}
    for word in feature_vect:
        features['contains(%s)' % word] = (word in set(tokens))
    return dict(features, **dict({'FRE': textstat.flesch_reading_ease(text),
                                  'FKGL': textstat.flesch_kincaid_grade(text)}))
Example #49
 def __load_text(self):
     tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
     with codecs.open('{}/{}'.format(local_data_dir, self.filename), 'r', encoding = 'utf8', errors = 'ignore') as f:
         data = f.read()
     self.flesch_reading_ease = textstat.flesch_reading_ease(data)
     self.flesch_kincaid_grade = textstat.flesch_kincaid_grade(data)
     sentences = tokenizer.tokenize(data)
     self.n_sentences = textstat.sentence_count(data)
     self.avg_sentence_length = textstat.lexicon_count(data, True) * 1. / self.n_sentences
     self.avg_word_length = np.mean([len(w) for s in sentences for w in s.split(' ') if w not in stopwords.words('english')])
     print 'Parse ', len(sentences), ' sentences, average sentence length ', self.avg_sentence_length, ', average word length ', self.avg_word_length
     self.sentences = sentences
     self.tokens = []
     for sentence in sentences:
         self.tokens.extend(text_tokenize(sentence))
Example #50
def run_healthfinder_query(search_terms, read_min,
                                         read_max,
                                         pol_min,
                                         pol_max,
                                         sub_min,
                                         sub_max):
    root_url = 'http://healthfinder.gov/developer/'
    search_type = 'Search.json'
    
    query = urllib2.quote(search_terms)

    
    search_url = "{0}{1}?api_key={2}&keyword={3}".format(
        root_url,
        search_type,
        HEALTHFINDER_API_KEY,
        query
    )
    
    results = []
    
    try:
        response = urllib2.urlopen(search_url).read()

        json_response = json.loads(response)
        
        
        for result in json_response["Result"]["Topics"]:
            print "here"
            summary = result["Sections"][0]["Description"]
            blobSummary = TextBlob(summary)
            read = textstat.flesch_reading_ease(summary)
            pola = float("%.2f" % blobSummary.sentiment.polarity)
            subj = float("%.2f" % blobSummary.sentiment.subjectivity)
            if (read_min <= read <= read_max) and (pol_min <= pola <= pol_max) and (subj <= sub_max and subj >= sub_min):
                results.append({
                    'title':result["Title"],
                    'url':result["AccessibleVersion"],
                    'summary':result["Sections"][0]["Description"],
                    'read':read,
                    'pola':pola,
                    'subj':subj,
                    'source':'HealthFinder'
                    })
    except urllib2.URLError as e:
        print "Error when querying the HealthFinder API: ", e
     
    return results
Example #51
def run_medline_query(search_terms, read_min,
                                     read_max,
                                     pol_min,
                                     pol_max,
                                     sub_min,
                                     sub_max):
    root_url = 'https://wsearch.nlm.nih.gov/ws/query'
    source = 'healthTopics'
    
    query = urllib.quote(search_terms)
    query = query.replace('%2B','+')
    query = query.replace('%27','%22')
    
    search_url = "{0}?db={1}&term={2}&rettype=brief".format(
        root_url,
        source,
        query)
       
    results = []
       
    try:
        response = urllib2.urlopen(search_url).read()
        response = xmltodict.parse(response)
        
        for result in response['nlmSearchResult']['list']['document']:
            summary = re.sub('\<.*?>','', result['content'][-1]['#text'])
            blobSummary = TextBlob(summary)
            read = textstat.flesch_reading_ease(summary)
            pola = float("%.2f" % blobSummary.sentiment.polarity)
            subj = float("%.2f" % blobSummary.sentiment.subjectivity)
            if (read_min <= read <= read_max) and (pol_min <= pola <= pol_max) and (subj <= sub_max and subj >= sub_min):
                results.append({
                    'title':re.sub('\<.*?\>','', result['content'][0]['#text']),
                    'url':result['@url'],
                    'summary':re.sub('\<.*?\>','', result['content'][-1]['#text']),
                    'read':read,
                    'pola':pola,
                    'subj':subj,
                    'source':'MedLine'
                    })
            

    except urllib2.URLError as e:
        print "Error when querying the MedLine API: ", e
        
    return results
    
Example #52
def main():
  for arg in sys.argv[1:]:
    with open(arg) as f:
      text = f.read()

    with open(arg + '.readability.snip','w') as f:
       f.write ("syllable_count : %s\n" % textstat.syllable_count(text))
       f.write ("lexicon_count : %s\n" % textstat.lexicon_count(text))
       f.write ("sentence_count : %s\n" % textstat.sentence_count(text))
       f.write ("difficult_words : %s\n" % textstat.difficult_words(text))
       f.write ("flesch_reading_ease : %s\n" % textstat.flesch_reading_ease(text))
       f.write ("flesch_kincaid_grade : %s\n" % textstat.flesch_kincaid_grade(text))
       f.write ("smog_index : %s\n" % textstat.smog_index(text))
       f.write ("automated_readability_index : %s\n" % textstat.automated_readability_index(text))
       f.write ("coleman_liau_index : %s\n" % textstat.coleman_liau_index(text))
       f.write ("linsear_write_formula : %s\n" % textstat.linsear_write_formula(text))
       f.write ("dale_chall_readability_score : %s\n" % textstat.dale_chall_readability_score(text))
Example #53
	def getReadingLevel(subreddit):
		query = '''SELECT body FROM 
		(SELECT body, RAND() AS r1
		FROM [fh-bigquery:reddit_comments.''' + str(year) + ''']
		WHERE subreddit == "''' + subreddit + '''"  
		AND body != "[deleted]"
		AND body != "[removed]"
		AND score > 1
		ORDER BY r1
		LIMIT 1000)
		'''

		bigquery_service = build('bigquery', 'v2', credentials=credentials)
		try:
			query_request = bigquery_service.jobs()
			query_data = {
				'query': query,
				'timeoutMs': 20000
			}

			query_response = query_request.query(
				projectId=bigquery_pid,
				body=query_data).execute()

		except HttpError as err:
			print('Error: {}'.format(err.content))
			raise err

		rows = query_response['rows']

		levels_sum = 0.0
		levels_count = 0
		for i in range(len(rows)):
			text = rows[i]['f'][0]['v']
			text = re.sub('([A-Za-z]+:\/\/[A-Za-z0-9]+\.[A-Za-z0-9]+[^\s-]*)|([A-Za-z]+\.[A-Za-z0-9]+\.[A-Za-z0-9]+[^\s-]*)', '', text) #url get rid
			text = re.sub('\s\s+', ' ', text)
			if textstat.sentence_count(text) > 0:
				levels_sum += textstat.flesch_reading_ease(text)
				levels_count += 1

		average_level = 0.0
		if levels_count > 0:
			average_level = levels_sum / levels_count
			results[subreddits.index(subreddit)] = [subreddit, 100.0 - average_level]
Example #54
def main():
    mongo_client = MongoClient('mongodb://*****:*****@'
                               '107.170.215.176:27017')
    reddit_data = mongo_client.reddit_data
    user_data = reddit_data.user_data
    user_reading_level = reddit_data.user_reading_level
    user_comments = reddit_data.user_comments

    user_reading_level.create_index(
        [("username", pymongo.ASCENDING)],
        background=True,
        unique=True,
        dropDups=True
    )

    for user in user_data.find(no_cursor_timeout=True).sort('data.name', 1):
        name = user['data']['name']
        print name
        comment_list = []
        for comment in user_comments.find({'data.author': name}):
            if comment['kind'] == 't1':  # Actually a comment
                comment_text = comment['data']['body']
                comment_list.append(comment_text)

        comment_book = ' '.join(comment_list).strip()
        try:
            if len(comment_book) > 0:
                reading_ease = textstat.flesch_reading_ease(comment_book)
            else:
                reading_ease = 0
        except TypeError:  # I hate textstat
            reading_ease = 0

        reading_level_data = {'username': name,
                              'reading_level': reading_ease}

        try:
            user_reading_level.insert_one(reading_level_data)
        except pymongo.errors.DuplicateKeyError:
            continue
Example #55
def healthapif(search):
    healthDict = {}
    health_api = "http://healthfinder.gov/developer/Search.xml?api_key=gnviveyezcuamzei&keyword="

    keywords = " ".join(search)
    search = keywords.replace('"',"%22")
    search = search.replace(' ',"%20")
    health_api += search

    j = requests.get(health_api).content

    root = ET.fromstring(j)

    # get data from Url tags only
    for topics in root.findall('Topics'):
        for topic in topics.findall('Topic'):
            try:
                title = topic.find('Title').text
                url = topic.find('AccessibleVersion').text
                healthDict[url] = [title]
            except AttributeError:
                title = keywords                              #if there is no title

            content = findContentHealth(url)
            if content != "empty":
                # Reading ease score
                score = textstat.flesch_reading_ease(content)
                scoreText = readingEaseScore(score)
                healthDict[url].append(str(score) + " (" + scoreText + ")")
            else:
                healthDict[url].append("-")

            # Sentiment score
            blob = TextBlob(content)
            sentimentPolarity = blob.sentiment.polarity
            sentimentSubjectivity = blob.sentiment.subjectivity
            sentimentScore = "polarity= %.3f (%s), subjectivity= %.3f (%s)" % (sentimentPolarity, polarityScore(sentimentPolarity), sentimentSubjectivity, subjectivityScore(sentimentSubjectivity))
            healthDict[url].append(sentimentScore)
    return healthDict
Example #56
def calculate_readability_measures(id):
    """ Count the words in doc and update the document. """
    es = elasticsearch.Elasticsearch()
    source = es.get_source(index='beek', doc_type='page', id=id)
    # count = len(source['content'].split())
    try:
        measures = {
            'flesch': textstat.flesch_reading_ease(source['content']),
            'smog': textstat.smog_index(source['content']),
            'flesch_kincaid': textstat.flesch_kincaid_grade(source['content']),
            'coleman_liau': textstat.coleman_liau_index(source['content']),
            'readability': textstat.automated_readability_index(source['content']),
            'dale_chall': textstat.dale_chall_readability_score(source['content']),
            'difficult_words': textstat.difficult_words(source['content']),
            'linsear_write_formula': textstat.linsear_write_formula(source['content']),
            'gunning_fog': textstat.gunning_fog(source['content']),
            'consensus': textstat.readability_consensus(source['content']),
        }

        es.update(index='beek', doc_type='page', id=id,
                  body={'doc': {'measures': measures}}, refresh=True)
    except Exception as err:
        pass
Example #57
def analyze(text):
    
    # Automatically reject if no input
    if text.isspace():
        return -1.0
    if text.startswith('http'):
        return -1.0
    
    # Analyze text
    try:
        x = textstat.flesch_reading_ease(text)
    except:
        return -1.0
    
    # Keep outputs valid
    if not isinstance(x, float):
        return -1.0
    if x < 0:
        return -1.0
    if x > 100:
        return 100.0
    
    return x
Example #58
def fic2text(ident):
   textsegs = Loader.get_field(data['fics'],ident,'fic') 
   rtags = Loader.get_field(data['base'],ident,'tags')
   rtext = ""

   for line in textsegs:
      line = line.replace(u'\xa0',' ')
      s = re.sub('([.,!?()])', r' \1 ', line)
      s = re.sub('\s{2,}', ' ', line)
      line = line.encode('ascii', 'ignore').decode('ascii')
      rtext += line+" "

   tags = []
   for genre in rtags:
      for el in rtags[genre]:
         tname = el["name"]
         tags.append(tname)

   reading_ease =  textstat.flesch_reading_ease(rtext)
   reading_level = textstat.flesch_kincaid_grade(rtext)
   print(ident,reading_ease,reading_level)
   #tokens = nltk.word_tokenize(rtext)
   return tags,rtext
Example #59
def extract_features_sub(text, dialogue = True):
	## aggregate all dialogue, action
	#scenes = format_script(file_name)
	if len(text) > 0:
		try:
			language_complexity = {'flesch_reading_ease': textstat.flesch_reading_ease(text), 'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text), 'automated_readability_index': textstat.automated_readability_index(text)}
		except:
			language_complexity = {'flesch_reading_ease': None, 'flesch_kincaid_grade': None, 'automated_readability_index': None}
	else:
		#badD.write(movie_name + "\n")
		language_complexity = {'flesch_reading_ease': 0, 'flesch_kincaid_grade': 0, 'automated_readability_index': 0}
	lexical_diversity = find_lex_d(text)
	sentiment = extract_senti_wordnet(text)
	#print sentiment
	inquirer_features = general_inquirer_features(text)
	final_features = {}
	final_features.update(language_complexity)
	final_features.update(lexical_diversity)
	final_features.update(sentiment)
	final_features.update(inquirer_features)
	curr_keys = [feature for feature in final_features]
	if dialogue:
		new_keys = [feature + "_" + "dialogue" for feature in final_features]
	else:
		new_keys = [feature + "_" + "action" for feature in final_features]
	#print final_features
	"""
	if dialogue: 
		for feature in final_features:
			final_features[feature + "_dialogue"] = final_features.pop(feature)
	else:
		for feature in final_features:
			final_features[feature + "_action"] = final_features.pop(feature)		
	#final_features = language_complexity + lexical_diversity + sentiment + inquirer_features
	"""
	return convert(final_features, dict(zip(curr_keys, new_keys)))