Example No. 1
from lexicons.liwc import Liwc


def liwc_correlation(str1, str2, _name1, _name2):
    liwc_lexicon = Liwc()
    # Prepare the data for analysis
    full_counts1 = liwc_lexicon.summarize_document(str1)
    full_counts2 = liwc_lexicon.summarize_document(str2)
    # Calculate and print the correlation between the two documents
    liwc_lexicon.print_analysis(full_counts1, full_counts2, _name1, _name2)
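A minimal usage sketch for the helper above; the two sample texts and the labels are invented for illustration, and print_analysis is assumed to print its comparison report directly.

# Hypothetical call to liwc_correlation(); the texts and labels are made up.
text_a = "We are happy and grateful for this wonderful day."
text_b = "I am worried and sad about the terrible news."
liwc_correlation(text_a, text_b, 'doc_a', 'doc_b')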
Example No. 2
from collections import Counter

from lexicons.liwc import Liwc


def liwc_results(text, _name):
    liwc_lexicon = Liwc()
    # Count the number of words in each category
    category_counts = Counter(liwc_lexicon.read_document(text))
    print 'Basic category counts: {}'.format(category_counts)
    # Calculate the score for each category and print a summary report
    full_counts = liwc_lexicon.summarize_document(text)
    liwc_lexicon.print_summarization(full_counts, _name)
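Similarly, a quick sketch of how liwc_results() might be called; the sample sentence and the label are invented here.

# Hypothetical call to liwc_results(); the text and label are made up.
sample = "Four score and seven years ago our fathers brought forth a new nation."
liwc_results(sample, 'gettysburg_excerpt')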
Example No. 3
import datetime
import re

import pymongo

from lexicons.liwc import Liwc


def process(poi, timelines):
    # Reset the LIWC flags so every user gets (re)processed
    poi.update(
        {}, {'$set': {
            "rliwc_anal.mined": False,
            "rliwc_anal.result": None
        }},
        multi=True)
    # Index the fields used by the repeated queries below
    poi.create_index([('timeline_auth_error_flag', pymongo.ASCENDING),
                      ('rliwc_anal.mined', pymongo.ASCENDING)])

    while True:
        # How many users whose timelines have not yet been processed by LIWC?
        count = poi.find({
            'timeline_auth_error_flag': False,
            "rliwc_anal.mined": False
        }).count()
        if count == 0:
            break
        else:
            print datetime.datetime.now().strftime(
                "%Y-%m-%d-%H-%M-%S") + "\t" + str(count) + " remaining"

        # Process users in batches of 250
        for user in poi.find({
                'timeline_auth_error_flag': False,
                "rliwc_anal.mined": False
        }).limit(250):
            liwc = Liwc()
            textmass = ""
            # Concatenate all of the user's tweets into one document
            for tweet in timelines.find({'user.id': user['id']}):
                # note: retweets ('retweeted_status') are not filtered out here
                text = tweet['text'].encode('utf8')
                textmass = textmass + " " + text

            # Strip @mentions, URLs and non-alphanumeric characters,
            # then collapse the remaining whitespace
            textmass = ' '.join(
                re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ",
                       textmass).split())
            textmass = textmass.lower()
            result = liwc.summarize_document(textmass)

            # Mark the user as processed and store the LIWC summary
            poi.update({'id': user['id']}, {
                '$set': {
                    "rliwc_anal.mined": True,
                    "rliwc_anal.result": result
                }
            })
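The same mention/URL/punctuation-stripping regular expression recurs in several of these examples. A small optional sketch of how it could be factored into a helper; the names clean_text and MENTION_URL_PUNCT are invented here and are not part of the Liwc API.

import re

# Hypothetical helper factoring out the cleanup used in the examples above:
# drop @mentions, URLs and any non-alphanumeric characters, then collapse
# whitespace and lowercase the result.
MENTION_URL_PUNCT = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)")

def clean_text(raw):
    return ' '.join(MENTION_URL_PUNCT.sub(" ", raw).split()).lower()

print clean_text("@user check http://example.com, it's great!")  # -> "check it s great"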
Example No. 4
import datetime
import re

import pymongo

from lexicons.liwc import Liwc


def process(poi, timelines):
    # Reset the LIWC flags so every user gets (re)processed
    poi.update({}, {'$set': {"rliwc_anal.mined": False, "rliwc_anal.result": None}}, multi=True)
    # Index the fields used by the repeated queries below
    poi.create_index([('timeline_auth_error_flag', pymongo.ASCENDING),
                      ('rliwc_anal.mined', pymongo.ASCENDING)])

    while True:
        # How many users whose timelines have not yet been processed by LIWC?
        count = poi.find({'timeline_auth_error_flag': False, "rliwc_anal.mined": False}).count()
        if count == 0:
            break
        else:
            print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + "\t" + str(count) + " remaining"

        # Process users in batches of 250
        for user in poi.find({'timeline_auth_error_flag': False, "rliwc_anal.mined": False}).limit(250):
            liwc = Liwc()
            textmass = ""
            # Concatenate all of the user's tweets into one document
            for tweet in timelines.find({'user.id': user['id']}):
                # note: retweets ('retweeted_status') are not filtered out here
                text = tweet['text'].encode('utf8')
                textmass = textmass + " " + text

            # Strip @mentions, URLs and non-alphanumeric characters
            textmass = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", textmass).split())
            textmass = textmass.lower()
            result = liwc.summarize_document(textmass)

            # Mark the user as processed and store the LIWC summary
            poi.update({'id': user['id']}, {'$set': {"rliwc_anal.mined": True, "rliwc_anal.result": result}})
Example No. 5
# -*- coding: utf-8 -*-
"""
Created on 11:55 AM, 11/4/15

@author: wt

"""

from collections import Counter
from lexicons.liwc import Liwc
liwc_lexicon = Liwc()
gettysburg = '''Four score and seven years ago our fathers brought forth on
  this continent a new nation, conceived in liberty, and dedicated to the
  proposition that all men are created equal. Now we are engaged in a great
  civil war, testing whether that nation, or any nation so conceived and so
  dedicated, can long endure. We are met on a great battlefield of that war.
  We have come to dedicate a portion of that field, as a final resting place
  for those who here gave their lives that that nation might live. It is
  altogether fitting and proper that we should do this.'''
# read_document() is a generator, but Counter will consume the whole thing
# category_counts = Counter(liwc_lexicon.read_document(gettysburg))
# print 'Basic category counts: {}'.format(category_counts)
# print out a tabulation that looks like the LIWC software's text report
full_counts = liwc_lexicon.summarize_document(gettysburg)
print full_counts
# liwc_lexicon.print_summarization(full_counts)
# Equivalent call through the instance (same output as above)
print liwc_lexicon.summarize_document(gettysburg)
print liwc_lexicon.summarize_document(
    '''Four score and seven years ago our fathers brought forth on
  this continent a new nation''')
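The commented-out Counter lines above can also be run directly; a small sketch, assuming (as the comment states) that read_document() is a generator yielding one category label per lexicon hit.

# Uncommented version of the lines above; assumes read_document() yields one
# category label per lexicon hit, as the comments suggest.
category_counts = Counter(liwc_lexicon.read_document(gettysburg))
print 'Basic category counts: {}'.format(category_counts)
print 'Five most frequent categories: {}'.format(category_counts.most_common(5))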
Example No. 6
print "TODO: create your own ED dictionary..."
print "TODO: upload the LIWC analysis result to the user prof entry? No LIWC should be done on a static set"

for username in usernames:
    results = tweets.find({'user.screen_name': username})
    textmass = ""
    # Concatenate all tweets by this user into one document
    for tweet in results:
        textmass = textmass + " " + tweet['text'].encode('utf8')

    # textmass = unicode(textmass, errors='ignore')
    # Strip @mentions, URLs and non-alphanumeric characters
    textmass = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", textmass).split())
#    outfile = open(username + ".txt", "w")
#    outfile.write(textmass)
#    outfile.close()
    textmass = textmass.lower()
    result = liwc.summarize_document(textmass)
    liwc.print_summarization(result)

#infilename = "all_text_Content.txt"
#infilename = "lincoln.txt"
#
#infile = open(infilename,"r")
#sample = ""
#
#for line in infile:
#    sample = sample  + " " + line
#
#liwc = Liwc()
#result = Liwc.summarize_document(liwc, sample)
#Liwc.print_summarization(liwc, result)     
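For reference, a runnable version of the commented-out block above; the lincoln.txt filename comes from those comments and is assumed to exist locally.

from lexicons.liwc import Liwc

# File-based variant of the commented-out block above. The filename is taken
# from the comments and is assumed to exist in the working directory.
infilename = "lincoln.txt"
with open(infilename, "r") as infile:
    sample = " ".join(line.strip() for line in infile)

liwc = Liwc()
result = liwc.summarize_document(sample)
liwc.print_summarization(result)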
Example No. 7
    }).count()
    if count == 0:
        break
    else:
        print datetime.datetime.now().strftime(
            "%Y-%m-%d-%H-%M-%S") + "\t" + str(count) + " remaining"

    for user in poi.find({
            'timeline_auth_error_flag': False,
            "liwc_anal.mined": False
    }).limit(250):
        #progcounter += 1
        #if progcounter%1000 == 0:
        #    print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")  + "\t" + str(progcounter)

        liwc = Liwc()
        textmass = ""

        for tweet in timelines.find({'user.id': user['id']}):
            # is it a retweet?
            #if not ('retweeted_status' in tweet):
            text = tweet['text'].encode('utf8')
            # text = re.sub(r"http\S+", "", text) # this doesn't do anything
            textmass = textmass + " " + text
            # print tweet['text'].encode('utf8')

        textmass = ' '.join(
            re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ",
                   textmass).split())
        textmass = textmass.lower()
        result = liwc.summarize_document(textmass)
Example No. 8
    if count == 0:
        break
    else:
        print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + "\t" + str(count) + " remaining"

    for user in poi.find({'timeline_auth_error_flag': False, "liwc_anal.mined": False}).limit(250):
        # progcounter += 1
        # if progcounter % 1000 == 0:
        #     print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + "\t" + str(progcounter)

        liwc = Liwc()
        textmass = ""

        # Concatenate all of the user's tweets into one document
        for tweet in timelines.find({'user.id': user['id']}):
            # note: retweets ('retweeted_status') are not filtered out here
            text = tweet['text'].encode('utf8')
            textmass = textmass + " " + text
            # print tweet['text'].encode('utf8')

        # Strip @mentions, URLs and non-alphanumeric characters
        textmass = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", textmass).split())
        textmass = textmass.lower()
        result = liwc.summarize_document(textmass)
        # print result
        # exit()
        # liwc.print_summarization(result)

        poi.update({'id': user['id']}, {'$set': {"liwc_anal.mined": True, "liwc_anal.result": result}})

Example No. 9
# -*- coding: utf-8 -*-
"""
Created on 11:55 AM, 11/4/15

@author: wt

"""

from collections import Counter
from lexicons.liwc import Liwc

liwc_lexicon = Liwc()
gettysburg = """Four score and seven years ago our fathers brought forth on
  this continent a new nation, conceived in liberty, and dedicated to the
  proposition that all men are created equal. Now we are engaged in a great
  civil war, testing whether that nation, or any nation so conceived and so
  dedicated, can long endure. We are met on a great battlefield of that war.
  We have come to dedicate a portion of that field, as a final resting place
  for those who here gave their lives that that nation might live. It is
  altogether fitting and proper that we should do this."""
# read_document() is a generator, but Counter will consume the whole thing
# category_counts = Counter(liwc_lexicon.read_document(gettysburg))
# print 'Basic category counts: {}'.format(category_counts)
# print out a tabulation that looks like the LIWC software's text report
full_counts = liwc_lexicon.summarize_document(gettysburg)
print full_counts
# liwc_lexicon.print_summarization(full_counts)
# Equivalent call through the instance (same output as above)
print liwc_lexicon.summarize_document(gettysburg)
print liwc_lexicon.summarize_document(
    """Four score and seven years ago our fathers brought forth on
  this continent a new nation""")