def liwc_correlation(str1, str2, _name1, _name2):
    """Summarize two texts with LIWC and print their comparison.

    A single ``Liwc`` lexicon summarizes both texts (first ``str1``, then
    ``str2``); the two summaries and their labels are then handed to
    ``Liwc.print_analysis``, which does the reporting.

    :param str1: first text to summarize.
    :param str2: second text to summarize.
    :param _name1: label passed through for the first text.
    :param _name2: label passed through for the second text.
    """
    lexicon = Liwc()

    # Prepare the data for analysis: one summary per input text, in order.
    summaries = [lexicon.summarize_document(text) for text in (str1, str2)]

    # Reporting (the correlation, per the original comment) is delegated.
    lexicon.print_analysis(summaries[0], summaries[1], _name1, _name2)
def liwc_results(str, _name):
    """Print the basic LIWC category counts and the summary for one text.

    :param str: text to analyse (the name shadows the builtin, but it is
        part of the existing signature and is therefore kept).
    :param _name: label passed through to ``Liwc.print_summarization``.
    """
    lexicon = Liwc()

    # Tally whatever read_document yields for the text, one count per item.
    tokens = lexicon.read_document(str)
    category_counts = Counter(tokens)
    print('Basic category counts: {}'.format(category_counts))

    # Per-category scores are computed and printed by the lexicon itself.
    summary = lexicon.summarize_document(str)
    lexicon.print_summarization(summary, _name)
def process(poi, timelines):
    """Run LIWC over each user's tweets and store the result on the user.

    Every document in ``poi`` is first reset to ``rliwc_anal.mined = False``
    and ``rliwc_anal.result = None``. Users whose
    ``timeline_auth_error_flag`` is ``False`` are then handled in batches of
    at most 250 until none of them is left unmined. For each user the tweet
    texts are joined, cleaned, lower-cased, summarized with ``Liwc`` and the
    summary is written back under ``rliwc_anal.result``.

    :param poi: user collection offering pymongo-style ``update``,
        ``create_index`` and ``find``; it is read and modified.
    :param timelines: tweet collection queried by ``user.id``; each tweet
        must carry a ``text`` field.
    """
    # One pass replaces each @mention, each character that is not an ASCII
    # letter, digit, space or tab, and each ``scheme://...`` URL by a space.
    cleaner = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")

    poi.update(
        {},
        {'$set': {"rliwc_anal.mined": False, "rliwc_anal.result": None}},
        multi=True)
    poi.create_index([('timeline_auth_error_flag', pymongo.ASCENDING),
                      ('rliwc_anal.mined', pymongo.ASCENDING)])

    while True:
        # How many users whose timelines have not been processed by LIWC
        count = poi.find({
            'timeline_auth_error_flag': False,
            "rliwc_anal.mined": False
        }).count()
        if count == 0:
            break
        print(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
              + "\t" + str(count) + " remaining")

        batch = poi.find({
            'timeline_auth_error_flag': False,
            "rliwc_anal.mined": False
        }).limit(250)
        for user in batch:
            # NOTE(review): a fresh Liwc is built per user, as before; it
            # could be hoisted if Liwc keeps no per-document state.
            liwc = Liwc()

            # Join once instead of growing the string tweet by tweet; the
            # whitespace normalization below makes the result identical.
            textmass = " ".join(
                tweet['text'].encode('utf8')
                for tweet in timelines.find({'user.id': user['id']}))
            textmass = ' '.join(cleaner.sub(" ", textmass).split())

            # str.lower() returns a new string; the previous code called it
            # and discarded the result, so the text was never lower-cased.
            textmass = textmass.lower()

            result = liwc.summarize_document(textmass)
            poi.update({'id': user['id']}, {
                '$set': {
                    "rliwc_anal.mined": True,
                    "rliwc_anal.result": result
                }
            })
def process(poi, timelines):
    """Run LIWC over each user's tweets and store the result on the user.

    Every document in ``poi`` is first reset to ``rliwc_anal.mined = False``
    and ``rliwc_anal.result = None``. Users whose
    ``timeline_auth_error_flag`` is ``False`` are then handled in batches of
    at most 250 until none of them is left unmined. For each user the tweet
    texts are joined, cleaned, lower-cased, summarized with ``Liwc`` and the
    summary is written back under ``rliwc_anal.result``.

    :param poi: user collection offering pymongo-style ``update``,
        ``create_index`` and ``find``; it is read and modified.
    :param timelines: tweet collection queried by ``user.id``; each tweet
        must carry a ``text`` field.
    """
    # One pass replaces each @mention, each character that is not an ASCII
    # letter, digit, space or tab, and each ``scheme://...`` URL by a space.
    cleaner = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")

    poi.update({}, {'$set': {"rliwc_anal.mined": False, "rliwc_anal.result": None}}, multi=True)
    poi.create_index([('timeline_auth_error_flag', pymongo.ASCENDING),
                      ('rliwc_anal.mined', pymongo.ASCENDING)])

    while True:
        # How many users whose timelines have not been processed by LIWC
        count = poi.find({'timeline_auth_error_flag': False, "rliwc_anal.mined": False}).count()
        if count == 0:
            break
        print(datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + "\t" + str(count) + " remaining")

        for user in poi.find({'timeline_auth_error_flag': False, "rliwc_anal.mined": False}).limit(250):
            # NOTE(review): a fresh Liwc is built per user, as before; it
            # could be hoisted if Liwc keeps no per-document state.
            liwc = Liwc()

            # Join once instead of growing the string tweet by tweet; the
            # whitespace normalization below makes the result identical.
            textmass = " ".join(
                tweet['text'].encode('utf8')
                for tweet in timelines.find({'user.id': user['id']}))
            textmass = ' '.join(cleaner.sub(" ", textmass).split())

            # str.lower() returns a new string; the previous code called it
            # and discarded the result, so the text was never lower-cased.
            textmass = textmass.lower()

            result = liwc.summarize_document(textmass)
            poi.update({'id': user['id']},
                       {'$set': {"rliwc_anal.mined": True, "rliwc_anal.result": result}})
# -*- coding: utf-8 -*-
"""
Created on 11:55 AM, 11/4/15

@author: wt
"""

from collections import Counter
from lexicons.liwc import Liwc

# Small demo: summarize a sample text with the LIWC lexicon and print it.
liwc_lexicon = Liwc()

# Sample document. Adjacent literals are concatenated by the parser, so
# this is one string.
gettysburg = (
    'Four score and seven years ago our fathers brought forth on this '
    'continent a new nation, conceived in liberty, and dedicated to the '
    'proposition that all men are created equal. '
    'Now we are engaged in a great civil war, testing whether that nation, '
    'or any nation so conceived and so dedicated, can long endure. '
    'We are met on a great battlefield of that war. '
    'We have come to dedicate a portion of that field, as a final resting '
    'place for those who here gave their lives that that nation might live. '
    'It is altogether fitting and proper that we should do this.'
)

# read_document() is a generator, but Counter will consume the whole thing
# category_counts = Counter(liwc_lexicon.read_document(gettysburg))
# print 'Basic category counts: {}'.format(category_counts)

# print out a tabulation that looks like the LIWC software's text report
full_counts = liwc_lexicon.summarize_document(gettysburg)
print(full_counts)
# liwc_lexicon.print_summarization(full_counts)

# Summarize the same text a second time, then only its opening words.
print(liwc_lexicon.summarize_document(gettysburg))
print(liwc_lexicon.summarize_document(
    'Four score and seven years ago our fathers brought forth on this '
    'continent a new nation'))
print("TODO: create your own ED dictionary...")
print("TODO: upload the LIWC analysis result to the user prof entry? No LIWC should be done on a static set")

# For every screen name: gather the user's tweet texts, clean and lower-case
# them, then print the LIWC summary. `usernames`, `tweets` and `liwc` are
# expected to be defined earlier in the script.
for username in usernames:
    results = tweets.find({'user.screen_name': username})

    # Join once instead of growing the string tweet by tweet; the whitespace
    # normalization below makes the result identical.
    textmass = " ".join(result['text'].encode('utf8') for result in results)
    #textmass = unicode( textmass, errors='ignore')

    # Replace @mentions, characters that are not ASCII letters, digits,
    # spaces or tabs, and scheme://... URLs by a space, then collapse
    # runs of whitespace.
    textmass = ' '.join(
        re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", textmass).split())

    # outfile = open(username + ".txt","w")
    # outfile.write(textmass)
    # outfile.close()

    # str.lower() returns a new string; the previous code called it and
    # discarded the result, so the text was never lower-cased.
    textmass = textmass.lower()

    result = Liwc.summarize_document(liwc, textmass)
    Liwc.print_summarization(liwc, result)

#infilename = "all_text_Content.txt"
#infilename = "lincoln.txt"
#
#infile = open(infilename,"r")
#sample = ""
#
#for line in infile:
#    sample = sample + " " + line
#
#liwc = Liwc()
#result = Liwc.summarize_document(liwc, sample)
#Liwc.print_summarization(liwc, result)
}).count() if count == 0: break else: print datetime.datetime.now().strftime( "%Y-%m-%d-%H-%M-%S") + "\t" + str(count) + " remaining" for user in poi.find({ 'timeline_auth_error_flag': False, "liwc_anal.mined": False }).limit(250): #progcounter += 1 #if progcounter%1000 == 0: # print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + "\t" + str(progcounter) liwc = Liwc() textmass = "" for tweet in timelines.find({'user.id': user['id']}): # is it a retweet? #if not ('retweeted_status' in tweet): text = tweet['text'].encode('utf8') # text = re.sub(r"http\S+", "", text) # this doesn't do anything textmass = textmass + " " + text # print tweet['text'].encode('utf8') textmass = ' '.join( re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", textmass).split()) textmass.lower() result = Liwc.summarize_document(liwc, textmass)
if count == 0: break else: print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") +"\t"+ str(count) + " remaining" for user in poi.find({'timeline_auth_error_flag':False, "liwc_anal.mined": False}).limit(250): #progcounter += 1 #if progcounter%1000 == 0: # print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + "\t" + str(progcounter) liwc = Liwc() textmass = "" for tweet in timelines.find({'user.id': user['id']}): # is it a retweet? #if not ('retweeted_status' in tweet): text = tweet['text'].encode('utf8') # text = re.sub(r"http\S+", "", text) # this doesn't do anything textmass = textmass + " " + text # print tweet['text'].encode('utf8') textmass = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",textmass).split()) textmass.lower() result = Liwc.summarize_document(liwc, textmass) #print result #exit() # Liwc.print_summarization(liwc, result) poi.update({'id':user['id']},{'$set':{"liwc_anal.mined": True, "liwc_anal.result":result}})
# -*- coding: utf-8 -*- """ Created on 11:55 AM, 11/4/15 @author: wt """ from collections import Counter from lexicons.liwc import Liwc liwc_lexicon = Liwc() gettysburg = """Four score and seven years ago our fathers brought forth on this continent a new nation, conceived in liberty, and dedicated to the proposition that all men are created equal. Now we are engaged in a great civil war, testing whether that nation, or any nation so conceived and so dedicated, can long endure. We are met on a great battlefield of that war. We have come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation might live. It is altogether fitting and proper that we should do this.""" # read_document() is a generator, but Counter will consume the whole thing # category_counts = Counter(liwc_lexicon.read_document(gettysburg)) # print 'Basic category counts: {}'.format(category_counts) # print out a tabulation that looks like the LIWC software's text report full_counts = liwc_lexicon.summarize_document(gettysburg) print full_counts # liwc_lexicon.print_summarization(full_counts) print Liwc.summarize_document(liwc_lexicon, gettysburg) print liwc_lexicon.summarize_document( """Four score and seven years ago our fathers brought forth on this continent a new nation"""