Example #1
def text_analytics(text):
    x = []  # returned unchanged when the text contains no sentences
    if textstat.sentence_count(text) != 0:
        lexicon = textstat.lexicon_count(text) #word count
        sent = textstat.sentence_count(text) #sentence count
        syll = textstat.syllable_count(text) #syllable count
        flesch = textstat.flesch_reading_ease(text) #flesch score
        smog = textstat.smog_index(text) #SMOG index
        fog = textstat.gunning_fog(text) #FOG index
        dale = textstat.dale_chall_readability_score(text) #grade level
        ari = textstat.automated_readability_index(text) #grade level
        cl = textstat.coleman_liau_index(text) #grade level

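        # Interaction features: each readability score scaled by the word,
        # sentence, and syllable counts computed above.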
        flesch1 = lexicon*flesch
        flesch2 = sent*flesch
        flesch3 = syll*flesch
        smog1 = lexicon*smog
        smog2 = sent*smog
        smog3 = syll*smog
        fog1 = lexicon*fog
        fog2 = sent*fog
        fog3 = syll*fog
        dale1 = lexicon*dale
        dale2 = sent*dale
        dale3 = syll*dale
        ari1 = lexicon*ari
        ari2 = sent*ari
        ari3 = syll*ari
        cl1 = lexicon*cl
        cl2 = sent*cl
        cl3 = syll*cl
        x = [lexicon, sent, syll, flesch, smog, fog, dale, ari, cl,
             flesch1, flesch2, flesch3, smog1, smog2, smog3,
             fog1, fog2, fog3, dale1, dale2, dale3,
             ari1, ari2, ari3, cl1, cl2, cl3]
    return x
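A brief usage sketch (the sample string is made up; it assumes textstat is imported as in the examples below):

features = text_analytics("Plain sentences keep scores low. Polysyllabic vocabulary raises them.")
print(len(features))  # 27 base and interaction features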
Example #2
def calculate_statistics(lyrics):
    """
    Calculates statistics based on the text_raw of the lyrics.
    :return: Annotated lyrics containing information about the songs
    """
    logging.info("Calculating Statistics")
    from textstat.textstat import textstat
    for idx, song in tqdm(enumerate(lyrics), total=len(lyrics)):
        try:
            song["num_syllables"] = textstat.syllable_count(song["text_raw"])
            song["num_words"] = textstat.lexicon_count(song["text_raw"])
            song["num_sentences"] = textstat.sentence_count(song["text_raw"])
            song["flesch_score"] = textstat.flesch_reading_ease(
                song["text_raw"])
            song["flesch_kincaid_level"] = textstat.flesch_kincaid_grade(
                song["text_raw"])
            song["fog_score"] = textstat.gunning_fog(song["text_raw"])
            song[
                "num_difficult_words"] = textstat.dale_chall_readability_score(
                    song["text_raw"])
        except Exception as e:
            logging.error(
                "Something bad happened in the current song ! Skipping it... \n{}"
                .format(song))
            logging.exception(e)
    return lyrics
Example #3
def _get_detailed_stats(no_code_text):
    """
    Returns detailed stats on text
    :param no_code_text: String to analyse
    :return: list of details
    """
    results = []
    group_by = 'Detailed Text Statistics'
    tb = TextBlob(no_code_text)
    # Spell check here...it's very slow
    results.append(
        TextFeature('Number of sentences',
                    textstat.sentence_count(no_code_text), group_by))
    results.append(
        TextFeature('Number of sentences (again)', len(tb.sentences),
                    group_by))
    results.append(TextFeature('Number of words', len(tb.words), group_by))
    results.append(
        TextFeature('Sentiment Polarity', tb.sentiment.polarity, group_by))
    results.append(
        TextFeature('Sentiment Subjectivity', tb.sentiment.subjectivity,
                    group_by))
    results.append(
        TextFeature('Detected Language', tb.detect_language(), group_by))
    results.append(
        TextFeature('Number of important phrases', len(tb.noun_phrases),
                    group_by))
    results.append(
        TextFeature('Number of word bi-grams', len(tb.ngrams(2)), group_by))
    results.append(
        TextFeature('Number of word tri-grams', len(tb.ngrams(3)), group_by))
    results.append(
        TextFeature('Number of word 4-grams', len(tb.ngrams(4)), group_by))
    return results
Example #4
    def _calculate_scores(self, docs):
        docs_scores = []

        for doc in docs:
            scores = {}
            scores['chars'] = ts.char_count(doc)
            scores['words'] = ts.lexicon_count(doc)
            scores['sents'] = ts.sentence_count(doc)
            #scores['syllables'] = ts.syllable_count(doc)
            scores['avg_sent_length'] = ts.avg_sentence_length(doc)
            scores['avg_syllables_per_word'] = ts.avg_syllables_per_word(doc)
            scores['avg_letters_per_word'] = ts.avg_letter_per_word(doc)
            scores['flesch'] = ts.flesch_reading_ease(doc)
            #scores['smog'] = ts.smog_index(doc)
            #scores['coleman_liau'] = ts.coleman_liau_index(doc)
            scores['automated_readability'] = ts.automated_readability_index(
                doc)
            #scores['linsear'] = ts.linsear_write_formula(doc)
            #scores['difficult_words'] = ts.difficult_words(doc)
            scores['dale_chall'] = ts.dale_chall_readability_score(doc)
            #scores['gunning_fog'] = ts.gunning_fog(doc)
            scores['lix'] = ts.lix(doc)
            docs_scores.append(scores)

        return docs_scores
Example #5
def get_special_metrics(text):
    blob = TextBlob(text)
    main = {
        "statistics": {
            "syllables": textstat.syllable_count(text),
            "words": textstat.lexicon_count(text),
            "characters": textstat.char_count(text),
            "polysyllables": textstat.polysyllabcount(text),
            "average letter per word": textstat.avg_letter_per_word(text),
            "average sentence length": textstat.avg_sentence_length(text),
            "average sentence per word": textstat.avg_sentence_per_word(text),
            "sentences": textstat.sentence_count(text),
        },
        "difficulty": {
            "flesch reading ease": textstat.flesch_reading_ease(text),
            "smog index": textstat.smog_index(text),
            "flesch kincaid grade": textstat.flesch_kincaid_grade(text),
            "coleman liau index": textstat.coleman_liau_index(text),
            #'automated readability index': textstat.automated_readability_index(text),
            #'dale chall readability score': textstat.dale_chall_readability_score(text),
            #'difficult words': textstat.difficult_words(text),
            #'linsear write formula': textstat.linsear_write_formula(text),
            "gunning fog": textstat.gunning_fog(text),
        },
        "sentiments": {"polarity": blob.sentiment.polarity, "subjectivity": blob.sentiment.subjectivity},
    }

    return main
Example #6
 def do_text_stats(self, text):
     ### Syllable Count
     syllable_count = textstat.syllable_count(text)
     ### Lexicon Count
     lexicon_count = textstat.lexicon_count(text, True)
     ### Sentence Count
     sentence_count = textstat.sentence_count(text)
     ### The Flesch Reading Ease formula
     try:
         flesch_reading_ease = textstat.flesch_reading_ease(text)
     except TypeError as e:
         flesch_reading_ease = None
     #* 90-100 : Very Easy
     #* 80-89 : Easy
     #* 70-79 : Fairly Easy
     #* 60-69 : Standard
     #* 50-59 : Fairly Difficult
     #* 30-49 : Difficult
     #* 0-29 : Very Confusing
      ### The Flesch-Kincaid Grade Level
     try:
         flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
     except TypeError as e:
         flesch_kincaid_grade = None
     ## The Fog Scale (Gunning FOG Formula)
     gunning_fog = textstat.gunning_fog(text)
     ### The SMOG Index
     smog_index = textstat.smog_index(text)
     ### Automated Readability Index
     automated_readability_index = textstat.automated_readability_index(
         text)
     ### The Coleman-Liau Index
     try:
         coleman_liau_index = textstat.coleman_liau_index(text)
     except TypeError as e:
         coleman_liau_index = None
     ### Linsear Write Formula
     linsear_write_formula = textstat.linsear_write_formula(text)
     ### Dale-Chall Readability Score
     dale_chall_readability_score = textstat.dale_chall_readability_score(
         text)
     ### Readability Consensus based upon all the above tests
     try:
         text_standard = textstat.text_standard(text)
     except TypeError as e:
         text_standard = None
     return {
         "syllable_count": syllable_count,
         "lexicon_count": lexicon_count,
         "sentence_count": sentence_count,
         "flesch_reading_ease": flesch_reading_ease,
         "flesch_kincaid_grade": flesch_kincaid_grade,
         "gunning_fog": gunning_fog,
         "smog_index": smog_index,
         "automated_readability_index": automated_readability_index,
         "coleman_liau_index": coleman_liau_index,
         "linsear_write_formula": linsear_write_formula,
         "dale_chall_readability_score": dale_chall_readability_score,
         "text_standard": text_standard
     }
Example #7
def get_special_metrics(text):
    blob = TextBlob(text)
    main = {
        'statistics': {
            'syllables': textstat.syllable_count(text),
            'words': textstat.lexicon_count(text),
            'characters': textstat.char_count(text),
            'polysyllables': textstat.polysyllabcount(text),
            'average letter per word': textstat.avg_letter_per_word(text),
            'average sentence length': textstat.avg_sentence_length(text),
            'average sentence per word': textstat.avg_sentence_per_word(text),
            'sentences': textstat.sentence_count(text)
        },
        'difficulty': {
            'flesch reading ease': textstat.flesch_reading_ease(text),
            'smog index': textstat.smog_index(text),
            'flesch kincaid grade': textstat.flesch_kincaid_grade(text),
            'coleman liau index': textstat.coleman_liau_index(text),
            #'automated readability index': textstat.automated_readability_index(text),
            #'dale chall readability score': textstat.dale_chall_readability_score(text),
            #'difficult words': textstat.difficult_words(text),
            #'linsear write formula': textstat.linsear_write_formula(text),
            'gunning fog': textstat.gunning_fog(text)
        },
        'sentiments': {
            'polarity': blob.sentiment.polarity,
            'subjectivity': blob.sentiment.subjectivity
        }
    }

    return main
Example #8
def feature_apply(feature_extractor, feature_vector, attribute, number_of_file):
    """
    Extract features from each document
    :param feature_extractor: function that extract features
    :param feature_vector: contains a list of features
    :param attribute: indicate if the process for gender or age feature extraction
    :param number_of_file: number of document to be processed
    :return:vector that contain the extracted features
    """
    corpus_root = '/root/Downloads/TextMining/pan13-author-profiling-training-corpus-2013-01-09/en'
    #corpus_root = '/root/Downloads/TextMining/pan13-author-profiling-training-corpus-2013-01-09/meTets'
    newcorpus = XMLCorpusReader(corpus_root, '.*')
    i=0
    feature_set = []
    doc_list = newcorpus.fileids()
    print(len(doc_list))

    for doc in doc_list[:number_of_file]:
        i += 1
        if i % 50 == 0:
            print(i)
        doc = newcorpus.xml(doc)
        number_of_conversation=int(doc[0].attrib["count"])
        #print(doc[0].attrib["count"])
        txt = " ".join([doc[0][j].text for j in range(number_of_conversation) if doc[0][j].text is not None])
        #print txt
        if textstat.sentence_count(txt) != 0:
            feature_set.append((feature_extractor(txt, feature_vector), doc.attrib[attribute]))

    return feature_set
Example #9
def main():
    csv_file2 = open(sys.argv[2], 'w', encoding="utf8")
    writer = csv.writer(csv_file2, delimiter=',')
    doc_id = 1
    writer.writerow(["ID", "URL", "text", "impact-score", "readability", "grade-level", "smog-index", "total-words", "total-sentences"])
    with open(sys.argv[1], 'r',  encoding="utf8", errors='ignore') as csv_file1:
        reader = csv.reader(csv_file1)
        # Skip the first line with headers
        next(reader)
        for row in reader:
            impact = str(row[0])
            url = str(row[1])
            text = str(row[2])
            read_ease = textstat.flesch_reading_ease(text)
            grade = textstat.flesch_kincaid_grade(text)
            smog = textstat.smog_index(text)
            words = textstat.lexicon_count(text)
            sentences = textstat.sentence_count(text)
            # Uncomment this if we want summary and key words
            # summary = summarize(text, ratio=0.3)
            # key_words = keywords(text, ratio=0.3)

            writer.writerow([doc_id]+[url]+[text]+[impact]+[read_ease]+[grade]+[smog]+[words]+[sentences])
            doc_id = doc_id+1
    csv_file1.close()
    csv_file2.close()

    print('Summary statistics complete!')
Example #10
def analyse_plain_text(test_data):
    text_stats = TextStats()

    # Do some simple analysis.
    from textblob import TextBlob
    zen = TextBlob(test_data)
    text_stats.word_count = len(zen.words)
    text_stats.sentence_count = len(zen.sentences)
    text_stats.polarity = zen.sentiment.polarity
    text_stats.subjectivity = zen.sentiment.subjectivity

    # Easy to read, this?
    from textstat.textstat import textstat
    text_stats.flesch_reading_ease = textstat.flesch_reading_ease(test_data)

    # Words per sentence count.
    from textstat.textstat import textstat
    text_stats.word_per_sentence_count = (
        textstat.lexicon_count(test_data, False) /
        textstat.sentence_count(test_data))

    # Convert all to lower.
    test_data = test_data.lower()

    # Tokenise.
    from nltk.tokenize import word_tokenize
    words = word_tokenize(test_data)

    # Tokenise stemmed text.
    from nltk.stem import PorterStemmer
    ps = PorterStemmer()
    test_data_stemmed = ''
    for w in words:
        test_data_stemmed = test_data_stemmed + ' ' + ps.stem(w)
    stemmed_words = word_tokenize(test_data_stemmed)

    # Remove non-words.
    nonPunct = re.compile('.*[A-Za-z0-9].*')  # must contain a letter or digit
    filtered = [w for w in stemmed_words if nonPunct.match(w)]

    # Remove stopwords:
    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('english'))
    extra_stopwords = set([
        'that', '\'s', 'wa', 'thi', 'like', 'n\'t', 'would', 'ha', 'us', 'get'
    ])
    filtered = [
        w for w in filtered if w not in stopwords and w not in extra_stopwords
    ]

    # How many unique words?
    from collections import Counter
    counts = Counter(filtered)
    text_stats.unique_word_count = len(counts)

    # Words sorted by most common.
    text_stats.counts = counts

    return text_stats
Example #11
def flesch_kincaid_score(text):
	sylCount = textstat.syllable_count(text)
	wordCount = len(text.split())
	sentenceCount = textstat.sentence_count(text)

	print "Syl count - %s, word count - %s, sentenceCount - %s " % (sylCount,wordCount,sentenceCount)

	return (0.39*(wordCount/sentenceCount)+11.8*(sylCount/wordCount) - 15.59)
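For comparison, textstat also exposes this formula directly as flesch_kincaid_grade (used in several other examples on this page); a minimal sketch with a made-up sample string:

sample = "Readability formulas need a few sentences. This sample only exercises the calls."
print(flesch_kincaid_score(sample))           # hand-rolled version above
print(textstat.flesch_kincaid_grade(sample))  # library's built-in equivalent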
Example #12
def composition(text, file):
    char_count = textstat.char_count(text)
    syll_count = textstat.syllable_count(text)
    lex_count = textstat.lexicon_count(text)
    sent_count = textstat.sentence_count(text)
    file.write(
        '\nChar count : %d\nSyllable count : %d \nLexicon count : %d \nSentence count : %d'
        % (char_count, syll_count, lex_count, sent_count))
Example #13
def get_stats(sentence):
	syllables = textstat.syllable_count(sentence)
	words = textstat.lexicon_count(sentence, True)
	sentence_count = textstat.sentence_count(sentence)

	if sentence_count > 0:
		text_standard = textstat.text_standard(sentence)
	else:
		text_standard = EMPTY_TEXT_STANDARD

	text_standard = fix_grammar_errors(text_standard)

	return combine(syllables, words, sentence_count, text_standard)
Example #14
def displayResults(path):
	print("stats")
	text = loadText(path)
	raw_tokens = raw_tokenize(text)
	print("number of words %s" % count_words(text))
	print("number of sentences %s" % textstat.sentence_count(text))
	print("unique words: %s" % len(set(raw_tokenize(text))))
	print("Difficulty %s / 100 " % (100 - textstat.flesch_reading_ease(text)))
	print("Average sentiment %s (negative: 0, neutral: 5, positive: 10)" % calculateSentiment(raw_tokens))
	print()
	print("topic distribution")
	displayTopicsDistributionWithinTheText(path, 300, pie=False)
	print("difficulty over the text")
	complexityAlongtheText(path, 300)
Example #15
 def __load_text(self):
     tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
     with codecs.open('{}/{}'.format(local_data_dir, self.filename), 'r', encoding = 'utf8', errors = 'ignore') as f:
         data = f.read()
     self.flesch_reading_ease = textstat.flesch_reading_ease(data)
     self.flesch_kincaid_grade = textstat.flesch_kincaid_grade(data)
     sentences = tokenizer.tokenize(data)
     self.n_sentences = textstat.sentence_count(data)
     self.avg_sentence_length = textstat.lexicon_count(data, True) * 1. / self.n_sentences
     self.avg_word_length = np.mean([len(w) for s in sentences for w in s.split(' ') if w not in stopwords.words('english')])
     print('Parse', len(sentences), 'sentences, average sentence length', self.avg_sentence_length, ', average word length', self.avg_word_length)
     self.sentences = sentences
     self.tokens = []
     [self.tokens.extend(text_tokenize(sentence)) for sentence in sentences]
Example #16
def averageSentenceNumber(dataset, data):
    count = 0
    finalfeatureset = []
    for author in data:
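        # Sum sentence counts over this author's emails and store the per-email average.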
        sentencecount = 0
        n = author['numemails']
        featureset = []
        for item in author['mailset']:
            sentencecount += textstat.sentence_count(item['text'])
        featureset = dataset[count]['featureSet']
        featureset.append({'averageSentenceCount': float(float(sentencecount) / float(n))})
        count += 1
        finalfeatureset.append({'author': author['author'], 'featureSet': featureset})

    return finalfeatureset
Example #17
def displayResults(path):
    print("stats")
    text = loadText(path)
    raw_tokens = raw_tokenize(text)
    print("number of words %s" % count_words(text))
    print("number of sentences %s" % textstat.sentence_count(text))
    print("unique words: %s" % len(set(raw_tokenize(text))))
    print("Difficulty %s / 100 " % (100 - textstat.flesch_reading_ease(text)))
    print("Average sentiment %s (negative: 0, neutral: 5, positive: 10)" % calculateSentiment(raw_tokens))
    print()
    print("topic distribution")
    displayTopicsDistributionWithinTheText(path, 300, pie=False)
    print("difficulty over the text")
    complexityAlongtheText(path, 300)
Example #18
    def analyze_one(self, email):
        """ Analyzes a single email and stores results. """

        sents = tstat.sentence_count(email)
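        # Store the sentence count, flooring it at one so it is never zero.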
        self.sent_count.append(sents if sents > 0 else 1)

        if email and len(email) > 0:
            self.flesch_kincaid_grade.append(tstat.flesch_kincaid_grade(email))
            self.automated_readability_index.append(
                tstat.automated_readability_index(email))
            self.coleman_liau_index.append(tstat.coleman_liau_index(email))
            self.linsear_write_formula.append(
                tstat.linsear_write_formula(email))
            self.dale_chall_readability_score.append(
                tstat.dale_chall_readability_score(email))
Example #19
def gettingFeatures(text):
    text = text.lower()

    #words / syllables / sentences count
    wordCount = len(text.split())
    syllables = textstat.syllable_count(text)
    sentences = textstat.sentence_count(text)
    try:
        #ReadabilityScore
        readabilityScore = 206.835 - 1.015 * (wordCount / sentences) - 84.6 * (
            syllables / wordCount)
        #ReadabilityGrade
        ReadabilityGrade = 0.39 * (wordCount / sentences) + 11.8 * (
            syllables / wordCount) - 15.59
    except:
        readabilityScore = 0
        ReadabilityGrade = 0
    print(readabilityScore, ReadabilityGrade)
    #Direction Count
    #private String[] direction = {"here", "there", "over there", "beyond", "nearly", "opposite", "under", "above", "to the left", "to the right", "in the distance"};
    DiractionCount = 0
    DiractionCount = text.count("here") + text.count("there") + text.count(
        "over there") + text.count("beyond") + text.count(
            "nearly") + text.count("opposite") + text.count(
                "under") + text.count("to the left") + text.count(
                    "to the right") + text.count("in the distance")
    #Exemplify count
    #private String[] exemplify = {"chiefly", "especially", "for instance", "in particular", "markedly", "namely", "particularly", "including", "specifically", "such as"};
    Exemplify = 0
    Exemplify = text.count("chiefly") + text.count("especially") + text.count(
        "for instance") + text.count("in particular") + text.count(
            "markedly") + text.count("namely") + text.count(
                "particularly") + text.count("incluiding") + text.count(
                    "specifically") + text.count("such as")

    try:
        #words per sentence (average)
        WPS = 0
        parts = [len(l.split()) for l in re.split(r'[?!.]', text) if l.strip()]
        WPS = sum(parts) / len(parts)  #number of words per sentence
    except:
        WPS = 0
    #print(wordCount, readabilityScore, ReadabilityGrade, DiractionCount, WPS, Exemplify)
    return numpy.array([
        wordCount, readabilityScore, ReadabilityGrade, DiractionCount, WPS,
        Exemplify
    ])
Example #20
def main():
  for arg in sys.argv[1:]:
    with open(arg) as f:
      text = f.read()

    with open(arg + '.readability.snip','w') as f:
       f.write ("syllable_count : %s\n" % textstat.syllable_count(text))
       f.write ("lexicon_count : %s\n" % textstat.lexicon_count(text))
       f.write ("sentence_count : %s\n" % textstat.sentence_count(text))
       f.write ("difficult_words : %s\n" % textstat.difficult_words(text))
       f.write ("flesch_reading_ease : %s\n" % textstat.flesch_reading_ease(text))
       f.write ("flesch_kincaid_grade : %s\n" % textstat.flesch_kincaid_grade(text))
       f.write ("smog_index : %s\n" % textstat.smog_index(text))
       f.write ("automated_readability_index : %s\n" % textstat.automated_readability_index(text))
       f.write ("coleman_liau_index : %s\n" % textstat.coleman_liau_index(text))
       f.write ("linsear_write_formula : %s\n" % textstat.linsear_write_formula(text))
       f.write ("dale_chall_readability_score : %s\n" % textstat.dale_chall_readability_score(text))
Example #21
def process_comment(comment):
    login = comment["user"]["login"]
    body = comment["body"]
    
    yield Feature("comment_count", login, 1) 
    if "RFR" in body and "not RFR" not in body:
        yield Feature('RFR', login, 1)
    if "RFM" in body and "not RFM" not in body:
        yield Feature("RFM", login, 1)
    if "LGTM" in body:
        yield Feature("LGTM", login, 1)
    if "PTAL" in body:
        yield Feature("PTAL", login, 1)
    if r"```" in body:
        yield Feature("code_block", login, 1)
    if r"@" in body:
        yield Feature("mention", login, 1)
    if "![" in body:
        yield Feature("image", login, 1)
    if " [ ]" in body or " [x]" in body:
        yield Feature("checklist", login, 1)
    for field in [":thumbsup:", "+1", ":ship:", ":shipit:", ":rocket:"]:
        if field in body:
            yield Feature(field, login, 1)
    
    txt = _clean_body(body)
    if not txt:
        return
    # yield Feature("avg_sentences_per_comment", login, textstat.sentence_count(txt))
    yield Feature("sentences", login, textstat.sentence_count(txt))

    if 'https://' in txt or 'http://' in txt:
        yield Feature('with_link', login, 1)
    issues = re.findall("#[0-9]{4,5}", txt)
    if issues:
        yield Feature("issue_crosslink", login, len(issues))

    issues = re.findall("\b(CS|BITLY|DATA|DEVOPS)-[0-9]{3,4}\b", txt)
    if issues:
        yield Feature("jira_crosslink", login, len(issues))
    
    if comment.get('issue_url'):
        issue_number = comment['issue_url'].split('/')[-1]
        if cached_issue_assignee(issue_number) == login:
            yield Feature("self_comment", login, 1)
Example #22
def scores_cal_ori(text):
    char_count_value = textstat.char_count(text, ignore_spaces=True)
    lexicon_count_value = textstat.lexicon_count(text, removepunct=True)
    syllable_count_value = textstat.syllable_count(text)
    sentence_count_value = textstat.sentence_count(text)
    avg_sentence_length_value = textstat.avg_sentence_length(text)
    avg_syllables_per_word_value = textstat.avg_syllables_per_word(text)
    avg_letter_per_word_value = textstat.avg_letter_per_word(text)
    avg_sentence_per_word_value = textstat.avg_sentence_per_word(text)
    flesch_kincaid_grade_value = textstat.flesch_kincaid_grade(text)
    smog_index_value = textstat.smog_index(text)
    gunning_fog_value = textstat.gunning_fog(text)
    difficult_words_value = textstat.difficult_words(text)
    dale_chall_value = textstat.dale_chall_readability_score(text)
    polysyllab_value = textstat.polysyllabcount(text)
    return (char_count_value, lexicon_count_value, syllable_count_value,
            sentence_count_value, avg_sentence_length_value,
            avg_syllables_per_word_value, avg_letter_per_word_value,
            avg_sentence_per_word_value, flesch_kincaid_grade_value,
            smog_index_value, gunning_fog_value, difficult_words_value,
            dale_chall_value, polysyllab_value)
Example #23
def analyse_json(json_text):
    # consider moving this to be a feature of Transcript in the other module

    df_witnesses = pd.DataFrame(columns=['html_file_location', 'witness_name',
                                         'syllable_count','lexicon_count',
                                         'sentence_count',
                                         'syllables_per_word',
                                         'gunning_fog', 'smog_index',
                                         'text_standard'],
                      index=[])

    trscrpt = json.loads(json_text)
    if 'witnesses' in trscrpt:
        witnesses = trscrpt['witnesses']
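        # Collect each witness's spoken sections, then compute readability
        # statistics over the concatenated text in the loop below.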


        for s in trscrpt['all_sections']:
            if 'speaker' in s and 'person' in s['speaker'] and \
                    s['speaker']['person']['speaker_type']=='witness':
                witness =  witnesses[s['speaker']['person']['name']]
                witness.setdefault('all_text', []).append(s['spoken_text'])

        for i, p in enumerate(witnesses):
            if 'all_text' in witnesses[p]:
                witness_text = '\n\n'.join(witnesses[p]['all_text'])
                if len(witness_text) > 0:
                    stats_data = {'html_file_location': trscrpt['html_file_location'],
                                  'witness_name': p,
                                  'syllable_count': textstat.syllable_count(witness_text),
                                  'lexicon_count': textstat.lexicon_count(witness_text),
                                  'sentence_count': textstat.sentence_count(witness_text),
                                  'syllables_per_word': textstat.avg_syllables_per_word(witness_text),
                                  'gunning_fog': textstat.gunning_fog(witness_text),
                                  'smog_index': textstat.smog_index(witness_text),
                                  'text_standard': textstat.text_standard(witness_text)}
                    df_witnesses.loc['witness_%i' % i] = stats_data
                else:
                    df_witnesses.loc['witness_%i' % i, 'html_file_location'] = trscrpt['html_file_location']
                    df_witnesses.loc['witness_%i' % i, 'witness_name'] = p
            else:
                df_witnesses.loc['witness_%i' % i, 'html_file_location'] = trscrpt['html_file_location']
                df_witnesses.loc['witness_%i' % i, 'witness_name'] = p

    return df_witnesses
Example #24
	def getReadingLevel(subreddit):
		query = '''SELECT body FROM 
		(SELECT body, RAND() AS r1
		FROM [fh-bigquery:reddit_comments.''' + str(year) + ''']
		WHERE subreddit == "''' + subreddit + '''"  
		AND body != "[deleted]"
		AND body != "[removed]"
		AND score > 1
		ORDER BY r1
		LIMIT 1000)
		'''

		bigquery_service = build('bigquery', 'v2', credentials=credentials)
		try:
			query_request = bigquery_service.jobs()
			query_data = {
				'query': query,
				'timeoutMs': 20000
			}

			query_response = query_request.query(
				projectId=bigquery_pid,
				body=query_data).execute()

		except HttpError as err:
			print('Error: {}'.format(err.content))
			raise err

		rows = query_response['rows']

		levels_sum = 0.0
		levels_count = 0
		for i in range(len(rows)):
			text = rows[i]['f'][0]['v']
			text = re.sub('([A-Za-z]+:\/\/[A-Za-z0-9]+\.[A-Za-z0-9]+[^\s-]*)|([A-Za-z]+\.[A-Za-z0-9]+\.[A-Za-z0-9]+[^\s-]*)', '', text) #url get rid
			text = re.sub('\s\s+', ' ', text)
			if textstat.sentence_count(text) > 0:
				levels_sum += textstat.flesch_reading_ease(text)
				levels_count += 1

		average_level = 0.0
		if levels_count > 0:
			average_level = levels_sum / levels_count
			results[subreddits.index(subreddit)] = [subreddit, 100.0 - average_level]
Example #25
def compute_syllables(text):
    num_sentence = textstat.sentence_count(text)
    text = re.sub('[^A-Za-z0-9]+', ' ', text)
    word_list = text.split()
    num_simple = 0
    num_complex = 0
    num_syllables = 0
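    # nsyl() is assumed to be defined elsewhere in this module (a per-word
    # syllable counter); words it cannot handle are skipped below.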
    for i in word_list:
        try:
            syllables = nsyl(i)
            if syllables >= 3:
                num_complex += 1
                num_syllables = num_syllables + syllables
            else:
                num_simple += 1
                num_syllables = num_syllables + syllables
        except:
            continue
    return [num_simple, num_complex, num_syllables, num_sentence]
Example #26
def _get_detailed_stats(no_code_text):
    """
    Returns detailed stats on text
    :param no_code_text: String to analyse
    :return: list of details
    """
    results = []
    group_by = 'Detailed Text Statistics'
    tb = TextBlob(no_code_text)
    # Spell check here...it's very slow
    results.append(TextFeature('Number of sentences', textstat.sentence_count(no_code_text), group_by))
    results.append(TextFeature('Number of sentences (again)', len(tb.sentences), group_by))
    results.append(TextFeature('Number of words', len(tb.words), group_by))
    results.append(TextFeature('Sentiment Polarity', tb.sentiment.polarity, group_by))
    results.append(TextFeature('Sentiment Subjectivity', tb.sentiment.subjectivity, group_by))
    results.append(TextFeature('Detected Language', tb.detect_language(), group_by))
    results.append(TextFeature('Number of important phrases', len(tb.noun_phrases), group_by))
    results.append(TextFeature('Number of word bi-grams', len(tb.ngrams(2)), group_by))
    results.append(TextFeature('Number of word tri-grams', len(tb.ngrams(3)), group_by))
    results.append(TextFeature('Number of word 4-grams', len(tb.ngrams(4)), group_by))
    return results
Example #27
def analyseText():
    values = request.get_json()
    required = [ 'inputText' ]
    if not all(k in values for k in required):
        return 'Missing values', 400

    text = values['inputText']
    result = {
        'syllable_count': textstat.syllable_count(text),
        'lexicon_count': textstat.lexicon_count(text),
        'sentence_count': textstat.sentence_count(text),
        'flesch_reading_ease': textstat.flesch_reading_ease(text),
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
        'gunning_fog': textstat.gunning_fog(text),
        'smog_index': textstat.smog_index(text),
        'automated_readability_index': textstat.automated_readability_index(text),
        'coleman_liau_index': textstat.coleman_liau_index(text),
        'linsear_write_formula': textstat.linsear_write_formula(text),
        'dale_chall_readability_score': textstat.dale_chall_readability_score(text)
    }

    return jsonify(result), 200
Example #28
def test_set(corpus_dir, feature_extrator, vect_path, i):
    """
    Read ,process the test set and extract features for each document
    :param corpus_dir:path of the test set
    :param feature_extrator: function that extract features
    :param vect_path:
    :param i:index of class in the true_pred dictionay values; if 0 it refers to the gender else it refers to the age
    :return:vector that contain the extracted features
    """
    vect = create_feature_vect(vect_path)
    newcorpus = XMLCorpusReader(corpus_dir, '.*')
    doc_list = newcorpus.fileids()
    test_feature_set = []
    true_pred = extract_true_pred(corpus_dir[:-2]+"truth-en.txt")
    for doc in doc_list:
        xml_name = doc
        doc = newcorpus.xml(doc)
        print(doc[0].attrib["count"])
        txt = fetch_text(doc)
        if (textstat.sentence_count(txt) != 0) and (txt != ""):
            test_feature_set.append((feature_extrator(txt, vect), true_pred[xml_name][i]))

    return test_feature_set
Example #29
 def __load_text(self):
     tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
     with codecs.open('{}/{}'.format(local_data_dir, self.filename),
                      'r',
                      encoding='utf8',
                      errors='ignore') as f:
         data = f.read()
     self.flesch_reading_ease = textstat.flesch_reading_ease(data)
     self.flesch_kincaid_grade = textstat.flesch_kincaid_grade(data)
     sentences = tokenizer.tokenize(data)
     self.n_sentences = textstat.sentence_count(data)
     self.avg_sentence_length = textstat.lexicon_count(
         data, True) * 1. / self.n_sentences
     self.avg_word_length = np.mean([
         len(w) for s in sentences for w in s.split(' ')
         if w not in stopwords.words('english')
     ])
     print('Parse', len(sentences), 'sentences, average sentence length', self.avg_sentence_length, ', average word length', self.avg_word_length)
     self.sentences = sentences
     self.tokens = []
     [self.tokens.extend(text_tokenize(sentence)) for sentence in sentences]
Example #30
 def stats(self, text):
     test_data = text
     stats = {}
     stats['flesch_reading_ease'] = textstat.flesch_reading_ease(test_data)
     stats['smog'] = textstat.smog_index(test_data)
     stats['flesch kincaid'] = textstat.flesch_kincaid_grade(test_data)
     stats['coleman Liau'] = textstat.coleman_liau_index(test_data)
     stats['automated'] = textstat.automated_readability_index(test_data)
     stats['dale chall'] = textstat.dale_chall_readability_score(test_data)
     stats['difficult'] = textstat.difficult_words(test_data)
     stats['linsear'] = textstat.linsear_write_formula(test_data)
     stats['gunning_fog'] = textstat.gunning_fog(test_data)
     stats['standard'] = textstat.text_standard(test_data)
     stats['charcount'] = textstat.char_count(test_data)
     stats['lexicon count'] = textstat.lexicon_count(test_data)
     stats['syllable count'] = textstat.syllable_count(test_data)
     stats['sentence count'] = textstat.sentence_count(test_data)
     stats['avg sentence length'] = textstat.avg_sentence_length(test_data)
     stats['avg_syllables_per_word'] = textstat.avg_syllables_per_word(
         test_data)
     stats['avg_letter_per_word'] = textstat.avg_letter_per_word(test_data)
     stats['avg_sentence_per_word'] = textstat.avg_sentence_per_word(
         test_data)
     return stats
Example #31
	ar_index_grades = []
	ar_index_total_grade = 0
	# Coleman-Liau index: goo.gl/8sE0m1
	cl_index_grades = []
	cl_index_total_grade = 0
	# Linsear Write Formula: goo.gl/GuOZ8B
	lwf_grades = []
	lwf_total_grade = 0
	# Dale-Chall Readability Score: goo.gl/dvmXmx
	dcr_grades = []
	dcr_total_grade = 0		
	
	num_tweets = 0
	for tweet in cleanest_tweets:
			# skipping tweets which are not just contextbased text. 
			if textstat.sentence_count(tweet) < 1:
				continue
			flesch_kincaid_grade = textstat.flesch_kincaid_grade(tweet)	
			flesch_kincaid_grades.append(flesch_kincaid_grade)
			flesch_kincaid_total_grade += flesch_kincaid_grade

			gunning_fog_grade = textstat.gunning_fog(tweet)	
			gunning_fog_grades.append(gunning_fog_grade)
			gunning_fog_total_grade += gunning_fog_grade

			smog_index_grade = textstat.smog_index(tweet)	
			smog_index_grades.append(smog_index_grade)
			smog_index_total_grade += smog_index_grade

			ar_index_grade = textstat.automated_readability_index(tweet)	
			ar_index_grades.append(ar_index_grade)
Example #32
    def updateData(self):

        # Full list of polarity scores
        self.polscore = self.sid.polarity_scores(self.text)

        ##### INDEX 0 IN DATA: Text Sentiment #####
        # [INDEX 0] Compounded score (0.0 - 1.0)            [INDEX 1] Negative connotation rating (0.0 - 1.0),
        # [INDEX 2] Positive connotation rating (0.0 - 1.0) [INDEX 3] Neutral connotation rating (0.0 - 1.0)
        self.data.append([
            self.polscore['compound'], self.polscore['neg'],
            self.polscore['pos'], self.polscore['neu']
        ])

        ##### INDEX 1 IN DATA: Sentence Info #####
        # [INDEX 0] Sentence count          [INDEX 1] Average sentence length
        # [INDEX 2] Syllable count          [INDEX 3] Overall word count
        # [INDEX 4] Character count         [INDEX 5] Character count without spaces
        # [INDEX 6] Avg letters per word    [INDEX 7] Avg syllables per word
        self.data.append([
            textstat.sentence_count(self.text),
            textstat.avg_sentence_length(self.text),
            textstat.syllable_count(self.text),
            len(self.splList),
            textstat.char_count(self.text, False),
            textstat.char_count(self.text, True),
            textstat.avg_letter_per_word(self.text),
            textstat.avg_syllables_per_word(self.text)
        ])

        ##### INDEX 2 IN DATA: Flesch Reading Ease #####
        # [INDEX 0] Pure score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 100
        self.freRaw = textstat.flesch_reading_ease(self.text)
        self.freStat = min(max(self.freRaw, 0), 100)
        self.data.append([
            round(self.freStat, 3),
            self.freGrade(self.freStat),
            round(abs(self.freStat - 100), 2)
        ])

        ##### INDEX 3 IN DATA: Flesch-Kincaid Grade #####
        # [INDEX 0] Pure score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 18
        self.fkgRaw = textstat.flesch_kincaid_grade(self.text)
        self.fkgStat = self.adjustScore(self.fkgRaw)
        self.data.append([
            round(self.fkgStat, 3),
            self.grade(self.fkgStat),
            round(self.fkgStat / 0.18, 2)
        ])

        ##### INDEX 4 IN DATA: Gunning FOG Index #####
        # [INDEX 0] Pure Score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 18
        self.fogRaw = textstat.gunning_fog(self.text)
        self.fogStat = self.adjustScore(self.fogRaw)
        self.data.append([
            round(self.fogStat, 3),
            self.grade(self.fogStat),
            round(self.fogStat / 0.18, 2)
        ])

        ##### INDEX 5 IN DATA: SMOG Index #####
        # [INDEX 0] Pure Score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 18
        self.smogRaw = textstat.smog_index(self.text)
        self.smogStat = self.adjustScore(self.smogRaw)
        self.data.append([
            round(self.smogStat, 3),
            self.grade(self.smogStat),
            round(self.smogStat / 0.18, 2)
        ])

        ##### INDEX 6 IN DATA: Automated Readability Index #####
        # [INDEX 0] Pure Score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 14
        self.ariRaw = textstat.automated_readability_index(self.text)
        self.ariStat = min(max(self.ariRaw, 0), 14)
        self.data.append([
            round(self.ariStat, 3),
            self.ariGrade(ceil(self.ariStat)),
            round(self.ariStat / 0.14, 2)
        ])  #13

        ##### INDEX 7 IN DATA: Coleman-Liau Index #####
        # [INDEX 0] Pure Score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 18
        self.cliRaw = textstat.coleman_liau_index(self.text)
        self.cliStat = self.adjustScore(self.cliRaw)
        self.data.append([
            round(self.cliStat, 3),
            self.grade(self.cliStat),
            round(self.cliStat / 0.18, 2)
        ])

        ##### INDEX 8 IN DATA: Linsear Write Index #####
        # [INDEX 0] Pure Score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 18
        self.lwiRaw = textstat.linsear_write_formula(self.text)
        self.lwiStat = self.adjustScore(self.lwiRaw)
        self.data.append([
            round(self.lwiStat, 3),
            self.grade(self.lwiStat),
            round(self.lwiStat / 0.18, 2)
        ])

        ##### INDEX 9 IN DATA: Dale-Chall Readability Score #####
        # [INDEX 0] Pure Score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 10
        self.dcrRaw = textstat.dale_chall_readability_score(self.text)
        self.dcrStat = min(max(self.dcrRaw, 0), 10)
        self.data.append([
            round(self.dcrStat, 3),
            self.daleChallGrade(self.dcrStat),
            round(self.dcrStat / 0.1, 2)
        ])

        ##### INDEX 10 IN DATA: Overall Score #####
        # [INDEX 0] Pure Score              [INDEX 1] Approximate grade     [INDEX 2] Normalized (ratio) score
        # SCORE SCALE: 0 - 20
        self.txtRaw = textstat.text_standard(self.text, True)
        self.txtStd = min(max(self.txtRaw, 0), 20)
        self.txtInfo = textstat.text_standard(self.text)
        self.data.append([
            round(self.txtStd, 3),
            self.txtGrade(self.txtStd, self.txtInfo),
            round(self.txtStd / 0.2, 2)
        ])

        return self.data
Example #33
                    try:
                        cur = {
                            "title": title,
                            "artist": artist,
                            "year": year,
                            "pos": pos,
                            "lyrics": lyrics,
                            "tags": get_tags(artist),
                            "sentiment": sent_analyzer.polarity_scores(lyrics_repl),
                            "f_k_grade": ts.flesch_kincaid_grade(lyrics_repl),
                            "flesch_index": ts.flesch_reading_ease(lyrics_repl),
                            "fog_index": ts.gunning_fog(lyrics_repl),
                            "difficult_words": ts.difficult_words(lyrics_repl),
                            "num_syllables": ts.syllable_count(lyrics_repl),
                            "num_words": ts.lexicon_count(lyrics_repl, True),
                            "num_lines": ts.sentence_count(lyrics_repl),
                            "num_dupes": count_dupes(lyrics)
                        }
                        # print cur
                        dataset.append(cur)
                    except Exception as e:
                        print(e)

            except Exception as e:
                print("Exception occurred for " + artist + ' - ' + title)
                print(e)

    outfile = "years/" + str(year) + '.txt'
    dir = os.path.dirname(outfile)
    if not os.path.exists(dir):
        os.makedirs(dir)
Example #34
#main script
if __name__ == '__main__':

    print("TextStat Comparison Script")
    print("--------------------------")

    # read in text from the command line
    # This needs to be fixed to deal with/escape special characters
    textToCheck = input("Please enter the text you would like to analyse: ")

    # read in text from a file - but what format?

    print("\n\n")
    print("Results")
    print("==============================================")
    print("==============================================\n")

    print("Syllable Count: " + str(textstat.syllable_count(textToCheck)))
    print("Lexicon Count: " + str(textstat.lexicon_count(textToCheck)))  # True is the default and removes punctuation before counting
    print("Sentence Count: " + str(textstat.sentence_count(textToCheck)))
    print("Flesch Reading Ease formula: " + str(textstat.flesch_reading_ease(textToCheck)))
    print("Flesch-Kincaid Grade Level: " + str(textstat.flesch_kincaid_grade(textToCheck)))
    print("Fog Scale (Gunning FOG Formula): " + str(textstat.gunning_fog(textToCheck)))
    print("SMOG Index: " + str(textstat.smog_index(textToCheck)))
    print("Automated Readability Index: " + str(textstat.automated_readability_index(textToCheck)))
    print("Coleman-Liau Index: " + str(textstat.coleman_liau_index(textToCheck)))
    print("Linsear Write Formula: " + str(textstat.linsear_write_formula(textToCheck)))
    print("Dale-Chall Readability Score: " + str(textstat.dale_chall_readability_score(textToCheck)))
    print("--------------------------------------------------------------")
    print("Readability Consensus based upon all the above tests: " + str(textstat.text_standard(textToCheck)))
    print("\n\n")
Example #35
def get_textstats(text): 
	return textstat.sentence_count(text), textstat.automated_readability_index(text), textstat.flesch_reading_ease(text)
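A brief usage sketch for the tuple this helper returns (the sample text is made up):

sentences, ari, reading_ease = get_textstats("Short sentences score well. Long, clause-heavy sentences do not.")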
Example #36
        print(
            "-------------------------Text Statistic-----------------------------------"
        )
        print("Returns the number of syllables present in the given text.")
        # print(textstat.syllable_count(test_data, lang='en_US'))
        num_syllables = textstat.syllable_count(test_data, lang='en_US')
        print(num_syllables)
        print(
            "Calculates the number of words present in the text - punctuation removed"
        )
        # print(textstat.lexicon_count(test_data, removepunct=True))
        num_words = textstat.lexicon_count(test_data, removepunct=True)
        print(num_words)
        print("Returns the number of sentences present in the given text.")
        # print(textstat.sentence_count(test_data))
        num_sentences = textstat.sentence_count(test_data)
        print(num_sentences)
        print("difficult words")
        # print(textstat.difficult_words(test_data))
        num_difficult_words = textstat.difficult_words(test_data)
        print(num_difficult_words)

        print(
            "-------------------------Difficulty------------------------------"
        )
        print("The Flesch Reading Ease Score")
        # print(textstat.flesch_reading_ease(test_data))
        difficulty_score = textstat.flesch_reading_ease(test_data)
        print(difficulty_score)

        if 0 <= difficulty_score < 30:
Example #37
def gettingFeatures(text):
    text = text.lower()

    #words / syllables / sentences count
    wordCount = len(text.split())
    syllables = textstat.syllable_count(text)
    sentences = textstat.sentence_count(text)
    try:
        #ReadabilityScore
        readabilityScore = 206.835 - 1.015 * (wordCount / sentences) - 84.6 * (
            syllables / wordCount)
        #ReadabilityGrade
        ReadabilityGrade = 0.39 * (wordCount / sentences) + 11.8 * (
            syllables / wordCount) - 15.59
    except:
        readabilityScore = 0
        ReadabilityGrade = 0
    #Direction Count
    #private String[] direction = {"here", "there", "over there", "beyond", "nearly", "opposite", "under", "above", "to the left", "to the right", "in the distance"};
    DiractionCount = 0
    DiractionCount = text.count("here") + text.count("there") + text.count(
        "over there") + text.count("beyond") + text.count(
            "nearly") + text.count("opposite") + text.count(
                "under") + text.count("to the left") + text.count(
                    "to the right") + text.count("in the distance")
    #Exemplify count
    #private String[] exemplify = {"chiefly", "especially", "for instance", "in particular", "markedly", "namely", "particularly", "including", "specifically", "such as"};
    Exemplify = 0
    Exemplify = text.count("chiefly") + text.count("especially") + text.count(
        "for instance") + text.count("in particular") + text.count(
            "markedly") + text.count("namely") + text.count(
                "particularly") + text.count("incluiding") + text.count(
                    "specifically") + text.count("such as")
    #Analytical thinking
    #Analytic = 0 #LIWC Analysis
    #Aunthenticity
    #Authentic  = 0 #LIWC Analysis
    #Emotional tone
    #Tone = 0 #LIWC Analysis
    try:
        #words per sentence (average)
        WPS = 0
        parts = [len(l.split()) for l in re.split(r'[?!.]', text) if l.strip()]
        WPS = sum(parts) / len(parts)  #number of words per sentence
    except:
        WPS = 0
    #Six letter words
    Sixltr = 0
    words = text.split()
    letter_count_per_word = {w: len(w) for w in words}
    for x in letter_count_per_word.values():
        if x >= 6:
            Sixltr = Sixltr + 1
    #Function words
    function = 0
    #Pronouns
    pronoun = 0
    text_tokens = word_tokenize(text)
    result = nltk.pos_tag(text_tokens)
    pronoun = len([(x, y) for x, y in result if y == "PRP" or y == "PRP$"])
    #Personal pronouns
    ppron = 0
    ppron = len([(x, y) for x, y in result if y == "PRP"])
    #I
    i = 0
    i = text.count("i")
    #You
    you = 0
    you = text.count("you")
    #Impersonal pronoun "one" / "it"
    ipron = 0
    ipron = text.count("one") + text.count("it")
    #Prepositions
    prep = 0
    prep = len([(x, y) for x, y in result if y == "IN"])
    #Auxiliary verbs do/be/have
    auxverb = 0
    auxverb = text.count("do") + text.count("does") + text.count(
        "don´t") + text.count("doesn´t") + text.count("has") + text.count(
            "have") + text.count("hasn´t") + text.count(
                "haven´t") + text.count("am") + text.count("are") + text.count(
                    "is") + text.count("´m") + text.count("´re") + text.count(
                        "´s")
    #Negations
    negate = 0
    negate = text.count("not")
    #Count interrogatives
    #interrog = 0 #LICW Analysis
    #Count numbers
    number = 0
    prep = len([(x, y) for x, y in result if y == "CD"])
    #Cognitive processes
    #cogproc = 0 #LIWC Analysis
    #Cause relationships
    #cause = 0 #LIWC Analysis
    #Discrepencies
    #discrep = 0 #LIWC Analysis
    #Tenant
    #tentat = 0 #LIWC Analysis
    #Differtiation
    #differ = 0 #LIWC Analysis
    #Perceptual processes
    #percept = 0 #LIWC Analysis
    #Verbs past focus VBD VBN
    focuspast = 0
    focuspast = len([(x, y) for x, y in result if y == "VBN" or y == "VBD"])
    #Verbs present focus VB VBP VBZ VBG
    focuspresent = 0
    focuspresent = len([(x, y) for x, y in result
                        if y == "VB" or y == "VBP" or y == "VBZ" or y == "VBG"])
    #net speak
    #netspeak = 0 #LIWC Analysis
    #Assent
    #assent = 0 #LIWC Analysis
    #Non fluencies
    #nonflu = 0 #LIWC Analysis
    #Count all punctuation
    AllPunc = 0
    punctuation = "!#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~"
    cd = {c: val for c, val in ct.Counter(text).items() if c in punctuation}
    for x in cd.values():
        AllPunc = AllPunc + x
    #number of commas
    Comma = 0
    Comma = text.count(",")
    #number of question marks
    QMark = 0
    QMark = text.count("?")

    #return numpy.array([wordCount,readabilityScore,ReadabilityGrade,DiractionCount,Analytic,Authentic,Tone,WPS,Sixltr,function,pronoun,ppron,i,you,ipron,prep,auxverb,negate,interrog,number,cogproc,cause,discrep,tentat,differ,percept,focuspast,focuspresent,netspeak,assent,nonflu,AllPunc,Comma,QMark,Exemplify])
    return [
        wordCount, readabilityScore, ReadabilityGrade, DiractionCount, WPS,
        Sixltr, pronoun, ppron, i, you, ipron, prep, auxverb, negate, number,
        focuspast, focuspresent, AllPunc, Comma, QMark, Exemplify
    ]
Example #38
        rt = response
        raw_html = response.read()
        g = goose.Goose()
        a = g.extract(raw_html=raw_html)
        htext = a.cleaned_text
        opinion = TextBlob(htext)
        pol = opinion.sentiment.polarity
        sub = opinion.sentiment.subjectivity
        rt = requests.get(qqll).elapsed.total_seconds()
        kw = str(keywords(htext, lemmatize=True))
        kw = kw.replace('\r', ' ').replace('\n', ' ')
        keyw = ' '.join(kw.split()[:3])
        sbody = htext.replace(',', '')
        fkg = textstat.flesch_kincaid_grade(htext)
        wc = textstat.lexicon_count(htext)
        sc = textstat.sentence_count(htext)
        fre = textstat.flesch_reading_ease(htext)
        sinsite = [
            'response time', 'subjective', 'polarity', 'fgrade', 'fscore',
            'words.counts', 'sentence.count', 'keywords', 'title', 'link',
            'text'
        ]
        wr.writerow(sinsite)
        insite = [rt, sub, pol, fkg, fre, wc, sc, keyw, a.title, qqll]
        wr.writerow(insite)

    rec = re.compile(r"https?://(www\.)?")
    zz = rec.sub('', qqll).strip().strip('/')
    with open('rowTwittersite.csv', 'w') as tsout:
        wr = csv.writer(tsout, quoting=csv.QUOTE_ALL)
        tnslist = [
Example #39
from textstat.textstat import textstat
import re
import time
import csv

start_time = time.time()

def load_file(file_path):
    comments = []
    with open(file_path, 'r') as file_reader:
        reader = csv.reader(file_reader, delimiter=',', quotechar='"')
        next(reader)
        for row in reader:
            text = re.sub('([A-Za-z]+:\/\/[A-Za-z0-9]+\.[A-Za-z0-9]+[^\s-]*)|([A-Za-z]+\.[A-Za-z0-9]+\.[A-Za-z0-9]+[^\s-]*)', '', row[0]) #url get rid
            text = re.sub('\s\s+', ' ', text)
            comments.append(text)
    return comments

docs = ["AskReddit2008.csv", "AskReddit2009.csv", "AskReddit2010.csv", "AskReddit2011.csv", "AskReddit2012.csv", "AskReddit2013.csv", "AskReddit2014.csv"]

for doc_path in docs:
    documents = load_file(doc_path)
    levels = [textstat.flesch_reading_ease(comment) for comment in documents if textstat.sentence_count(comment) != 0]
    print("reading level for " + doc_path)
    print(sum(levels) / len(levels))

elapsed_time = time.time() - start_time
print("elapsed time in seconds: " + str(elapsed_time))
Example #40
if __name__ == '__main__':

    # prompt user for file and open it
    user_input = input("Enter file name to open: ")
    input_string = open(user_input).read()
    user_input = input("Enter file name to write to: ")

    # declare/initialize lists
    copy_string = input_string.split()
    words_with_synonyms = []
    the_synonyms = []

    # get number of syllables, words, sentences, and FK score for the file
    num_syllables = textstat.syllable_count(input_string)
    num_words = textstat.lexicon_count(input_string)
    num_sentences = textstat.sentence_count(input_string)
    fk_score = 206.835 - float(1.015 * (num_words / num_sentences)) - float(
        84.6 * (num_syllables / num_words))

    # print number of syllables, words, and sentences in the file
    print("\nNumber of syllables: ", num_syllables)
    print("Number of words: ", num_words)
    print("Number of sentences: ", num_sentences)

    output = synonym_replacement(input_string, copy_string)
    #output = remove_adjective(output, copy_string)
    initial_grade = check_reading_level(input_string)
    new_grade = check_reading_level(output)

    new_num_syllables = textstat.syllable_count(output)
    new_num_words = textstat.lexicon_count(output)
Example #41
#!/bin/python

import sys, string, os
from textstat.textstat import textstat

inputfile = ''
test_data = ""

script_name = sys.argv[0]
inputfile = sys.argv[1]

with open(inputfile) as myfile:
	test_data="".join(line.rstrip() for line in myfile)

var1 = str(textstat.flesch_reading_ease(test_data))
var2 = str(textstat.smog_index(test_data))
var3 = str(textstat.flesch_kincaid_grade(test_data))
var4 = str(textstat.coleman_liau_index(test_data))
var5 = str(textstat.automated_readability_index(test_data))
var6 = str(textstat.dale_chall_readability_score(test_data))
var7 = str(textstat.difficult_words(test_data))
var8 = str(textstat.linsear_write_formula(test_data))
var9 = str(textstat.gunning_fog(test_data))
var10 = str(textstat.readability_consensus(test_data))
var11 = str(textstat.syllable_count(test_data))
var12 = str(textstat.lexicon_count(test_data, 1))
var13 = str(textstat.sentence_count(test_data))

print(var1 + ',' + var2 + ',' + var3 + ',' + var4 + ',' + var5 + ',' + var6 + ',' + var7 + ',' + var8 + ',' + var9 + ',' + var10 + ',' + var11 + ',' + var12 + ',' + var13)
# test_data = "georges a hotel in saint john state rica. save with expedia's price guarantee."
# test_data = "suwanee oceanside princess. no best costs. great booking book a hotel in ibiza. " \
#             "great booking book a hotel in ibiza."
# test_data = "refund hotel in ewr of george area area nearby. 3-star belfast hotel in"
test_data = "great rates. book at western blue casino hotel, bangkok. no reservation costs. great"

print(
    "-------------------------Text Statistic-----------------------------------"
)
print("Returns the number of syllables present in the given text.")
print(textstat.syllable_count(test_data, lang='en_US'))
print(
    "Calculates the number of words present in the text - punctuation removed")
print(textstat.lexicon_count(test_data, removepunct=True))
print("Returns the number of sentences present in the given text.")
print(textstat.sentence_count(test_data))
print("difficult words")
print(textstat.difficult_words(test_data))
print(
    "-------------------------Readability Formula------------------------------"
)
print("The Flesch Reading Ease Score")
print(textstat.flesch_reading_ease(test_data))
print("The SMOG Index")
print("Texts of fewer than 30 sentences are statistically invalid, "
      "because the SMOG formula was normed on 30-sentence samples.")
print("textstat requires atleast 3 sentences for a result.")
print(textstat.smog_index(test_data))
print("The Flesch-Kincaid Grade")
print(textstat.flesch_kincaid_grade(test_data))
print("The Coleman-Liau Index")
Example #43
import math  # assumed import: math.sqrt is used below

def calculate_number_of_sentences(review):
    review = str(review)
    if len(review) > 0:
        return math.sqrt(math.sqrt(textstat.sentence_count(review)))
    else:
        return 0
Example #44
remove_digits = str.maketrans('', '', digits)

strip_nums = str.maketrans('', '', digits)
df['text'] = df['text'].apply(lambda x: x.translate(remove_digits))

# Print pro/con text of one review.
#df.loc[6,'pros']
#df.loc[6,'cons']
#df.loc[7,'text']

# Create features for grade level, reading ease, word counts, sentence count, and paragraphs.
# Source: https://pypi.python.org/pypi/textstat/
# Note: \r = paragraph break. \n = white space.
df['read_ease_grade'] = df['text'].apply(
    lambda x: textstat.flesch_kincaid_grade(x))
df['sentence_count'] = df['text'].apply(lambda x: textstat.sentence_count(x))
df['word_count'] = df['text'].apply(lambda x: textstat.lexicon_count(x))
df['word_count_squared'] = (df['word_count'])**2
df['paragraph'] = df['pros'].apply(lambda x: x.count('\r')) + df['cons'].apply(
    lambda x: x.count('\r'))
df['text_ratio'] = (df['textLengthPro'] - df['textLengthCon']) / (
    (df['textLengthPro'] + df['textLengthCon']))

################################################################
#### Stop words, tokenize, stemming.
################################################################

#### Tokenize text.
tokenizer = RegexpTokenizer(r'\w+')
df['tokens'] = df['text'].apply(lambda x: tokenizer.tokenize(x))
Example #45
def passes_sentencify(axes, fig, config_value):
    """
    lint rule for sentencify
    """
    return textstat.sentence_count(axes.get_title()) >= 1