def process(poi, timelines):
    """Summarize each user's tweets with LIWC and store the result on the user.

    Every document in ``poi`` is first reset (``rliwc_anal.mined`` = False,
    ``rliwc_anal.result`` = None).  Users with ``timeline_auth_error_flag``
    equal to False are then handled in batches of at most 250 until none
    remain unmined.  For each user, the text of every tweet in ``timelines``
    whose ``user.id`` matches is concatenated, cleaned, lowercased, and passed
    to ``Liwc.summarize_document``; the result is written back to the user
    document together with ``rliwc_anal.mined`` = True.

    Args:
        poi: Collection of user documents; accessed through ``update``,
            ``create_index`` and ``find`` (legacy PyMongo-style API).
        timelines: Collection of tweet documents; each one read here has a
            ``text`` field and is looked up by ``user.id``.

    Returns:
        None.  Results are persisted in ``poi``; progress goes to stdout.
    """
    # Start from a clean slate: mark every user as not yet mined.
    poi.update(
        {}, {'$set': {
            "rliwc_anal.mined": False,
            "rliwc_anal.result": None
        }},
        multi=True)
    # Compound index over the two fields used by the filter below.
    poi.create_index([('timeline_auth_error_flag', pymongo.ASCENDING),
                      ('rliwc_anal.mined', pymongo.ASCENDING)])

    # Users whose timelines have not been processed by LIWC yet.
    pending = {
        'timeline_auth_error_flag': False,
        "rliwc_anal.mined": False
    }

    while True:
        count = poi.find(pending).count()
        if count == 0:
            break
        # Single-argument print(...) behaves the same as the print statement.
        print(datetime.datetime.now().strftime(
            "%Y-%m-%d-%H-%M-%S") + "\t" + str(count) + " remaining")

        # Work in bounded batches; each processed user is flagged as mined,
        # so the next query returns a fresh set of users.
        for user in poi.find(pending).limit(250):
            liwc = Liwc()

            # Collect the tweet texts and join them once instead of growing
            # a string with repeated concatenation (quadratic in the worst
            # case).  Retweets are not filtered out.
            texts = [
                tweet['text'].encode('utf8')
                for tweet in timelines.find({'user.id': user['id']})
            ]

            # Replace @mentions, URLs and every character that is not an
            # ASCII letter, digit, space or tab with a space, then collapse
            # whitespace runs to single spaces.
            textmass = ' '.join(
                re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ",
                       " ".join(texts)).split())
            # str.lower() returns a new string; the original code discarded
            # it, so the text was never actually lowercased.
            textmass = textmass.lower()

            result = liwc.summarize_document(textmass)

            poi.update({'id': user['id']}, {
                '$set': {
                    "rliwc_anal.mined": True,
                    "rliwc_anal.result": result
                }
            })
Exemplo n.º 2
0
# -*- coding: utf-8 -*-
"""
Created on 11:55 AM, 11/4/15

@author: wt

"""

from collections import Counter
from lexicons.liwc import Liwc
# A single lexicon instance is reused for every document summarized below.
liwc_lexicon = Liwc()

# Sample document: the opening of the Gettysburg Address.
gettysburg = '''Four score and seven years ago our fathers brought forth on
  this continent a new nation, conceived in liberty, and dedicated to the
  proposition that all men are created equal. Now we are engaged in a great
  civil war, testing whether that nation, or any nation so conceived and so
  dedicated, can long endure. We are met on a great battlefield of that war.
  We have come to dedicate a portion of that field, as a final resting place
  for those who here gave their lives that that nation might live. It is
  altogether fitting and proper that we should do this.'''

# Disabled alternative: read_document() is a generator, so wrapping it in
# Counter would consume it entirely and tally the basic category counts.
# category_counts = Counter(liwc_lexicon.read_document(gettysburg))
# print 'Basic category counts: {}'.format(category_counts)

# Summarize the full sample and print the raw summary.  The disabled call
# underneath would print a tabulation resembling the LIWC software's text
# report instead.
full_counts = liwc_lexicon.summarize_document(gettysburg)
print(full_counts)
# liwc_lexicon.print_summarization(full_counts)

# The same summary again, this time calling the method through the class
# with the instance passed explicitly.
print(Liwc.summarize_document(liwc_lexicon, gettysburg))

# Finally, summarize only a short excerpt.
short_excerpt = '''Four score and seven years ago our fathers brought forth on
  this continent a new nation'''
print(liwc_lexicon.summarize_document(short_excerpt))
Exemplo n.º 3
0
    }).count()
    if count == 0:
        break
    else:
        print datetime.datetime.now().strftime(
            "%Y-%m-%d-%H-%M-%S") + "\t" + str(count) + " remaining"

    for user in poi.find({
            'timeline_auth_error_flag': False,
            "liwc_anal.mined": False
    }).limit(250):
        #progcounter += 1
        #if progcounter%1000 == 0:
        #    print datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")  + "\t" + str(progcounter)

        liwc = Liwc()
        textmass = ""

        for tweet in timelines.find({'user.id': user['id']}):
            # is it a retweet?
            #if not ('retweeted_status' in tweet):
            text = tweet['text'].encode('utf8')
            # text = re.sub(r"http\S+", "", text) # this doesn't do anything
            textmass = textmass + " " + text
            # print tweet['text'].encode('utf8')

        textmass = ' '.join(
            re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ",
                   textmass).split())
        textmass.lower()
        result = Liwc.summarize_document(liwc, textmass)