예제 #1
0
파일: liwc_batch.py 프로젝트: wtgme/ohsn
def bunch_user_tweets_dataframe(dbname, comname, timename, filename, num_batch=2, n=-1):
    '''
    :param dbname: db name
    :param comname: user collection name
    :param timename: timeline collection name
    :param n: split tweets every n
    :return: pandas dataframe
    '''
    db = dbt.db_connect_no_auth(dbname)
    com = db[comname]
    times = db[timename]

    liwc_results = []
    indices = []
    user_dis = []
    user_create_time = []
    first_tweet_time = []
    last_tweet_time = []
    counts = []
    fields = []
    split_k = False
    if n == -1:
        split_k = True

    for user in com.find({'timeline_count': {'$gt': 100}}, ['id', 'id_str', 'created_at', 'timeline_count'], no_cursor_timeout=True):
        uid = user['id']
        tweet_count = user['timeline_count']
        if split_k == True:
            n = (tweet_count-1)/num_batch
            print '---------------------------------------------'
            print '%d tweets batched in every %d' %(tweet_count, n)

        count = 0
        index = 0
        tweets = []

        for tweet in times.find({'user.id': uid}).sort([("id", 1)]):
            if count < n:
                tweets.append(tweet)
                count += 1
            else:
                result = liwcp.process_tweet(tweets, Trim_rt=False)
                if result:
                    liwc_results.append([result[k] for k in result.keys()])
                    if len(fields) == 0:
                        fields = [k for k in result.keys()]
                    user_dis.append(user['id_str'])
                    indices.append(index)
                    counts.append(n)
                    user_create_time.append(datetime.strptime(user['created_at'], '%a %b %d %H:%M:%S +0000 %Y'))
                    first_tweet_time.append(datetime.strptime(tweets[0]['created_at'], '%a %b %d %H:%M:%S +0000 %Y'))
                    last_tweet_time.append(datetime.strptime(tweets[-1]['created_at'], '%a %b %d %H:%M:%S +0000 %Y'))
                    print 'User %s, in time %d with %d tweets ---- verify %d tweets' %(user['id_str'], index, count, len(tweets))
                index += 1
                tweets = [tweet]
                count = 1

    liwc_results = np.array(liwc_results)
    size = len(user_create_time)
    counts = np.reshape(counts, (size, 1))
    user_create_time = np.reshape(user_create_time, (size, 1))
    first_tweet_time = np.reshape(first_tweet_time, (size, 1))
    last_tweet_time = np.reshape(last_tweet_time, (size, 1))

    user_dis = np.reshape(user_dis, (size, 1))
    indices = np.reshape(indices, (size, 1))

    user_dis = np.append(user_dis, indices, axis=1)
    user_dis = np.append(user_dis, user_create_time, axis=1)
    user_dis = np.append(user_dis, first_tweet_time, axis=1)
    user_dis = np.append(user_dis, last_tweet_time, axis=1)
    user_dis = np.append(user_dis, counts, axis=1)
    liwc_results = np.append(user_dis, liwc_results, axis=1)
    print 'user matrix', liwc_results.shape


    df = pd.DataFrame(data=liwc_results,
                      columns=['user_id', 'time_index', 'user_created_time', 'first_tweet_time', 'last_tweet_time', 'count'] + fields)
    df.to_csv(filename)
    df.to_pickle(filename+'.pick')
예제 #2
0
def bunch_user_tweets_dataframe(dbname,
                                comname,
                                timename,
                                filename,
                                num_batch=2,
                                n=-1):
    '''
    :param dbname: db name
    :param comname: user collection name
    :param timename: timeline collection name
    :param n: split tweets every n
    :return: pandas dataframe
    '''
    db = dbt.db_connect_no_auth(dbname)
    com = db[comname]
    times = db[timename]

    liwc_results = []
    indices = []
    user_dis = []
    user_create_time = []
    first_tweet_time = []
    last_tweet_time = []
    counts = []
    fields = []
    split_k = False
    if n == -1:
        split_k = True

    for user in com.find({'timeline_count': {
            '$gt': 0
    }}, ['id', 'id_str', 'created_at', 'timeline_count'],
                         no_cursor_timeout=True):
        uid = user['id']
        tweet_count = user['timeline_count']
        if split_k == True:
            n = (tweet_count - 1) / num_batch
            print '---------------------------------------------'
            print '%d tweets batched in every %d' % (tweet_count, n)

        count = 0
        index = 0
        tweets = []

        for tweet in times.find({'user.id': uid}).sort([("id", 1)]):
            if count < n:
                tweets.append(tweet)
                count += 1
            else:
                result = liwcp.process_tweet(tweets, Trim_rt=True)
                if result:
                    liwc_results.append([result[k] for k in result.keys()])
                    if len(fields) == 0:
                        fields = [k for k in result.keys()]
                    user_dis.append(user['id_str'])
                    indices.append(index)
                    counts.append(n)
                    user_create_time.append(
                        datetime.strptime(user['created_at'],
                                          '%a %b %d %H:%M:%S +0000 %Y'))
                    first_tweet_time.append(
                        datetime.strptime(tweets[0]['created_at'],
                                          '%a %b %d %H:%M:%S +0000 %Y'))
                    last_tweet_time.append(
                        datetime.strptime(tweets[-1]['created_at'],
                                          '%a %b %d %H:%M:%S +0000 %Y'))
                    print 'User %s, in time %d with %d tweets ---- verify %d tweets' % (
                        user['id_str'], index, count, len(tweets))
                index += 1
                tweets = [tweet]
                count = 1

    liwc_results = np.array(liwc_results)
    size = len(user_create_time)
    counts = np.reshape(counts, (size, 1))
    user_create_time = np.reshape(user_create_time, (size, 1))
    first_tweet_time = np.reshape(first_tweet_time, (size, 1))
    last_tweet_time = np.reshape(last_tweet_time, (size, 1))

    user_dis = np.reshape(user_dis, (size, 1))
    indices = np.reshape(indices, (size, 1))

    user_dis = np.append(user_dis, indices, axis=1)
    user_dis = np.append(user_dis, user_create_time, axis=1)
    user_dis = np.append(user_dis, first_tweet_time, axis=1)
    user_dis = np.append(user_dis, last_tweet_time, axis=1)
    user_dis = np.append(user_dis, counts, axis=1)
    liwc_results = np.append(user_dis, liwc_results, axis=1)
    print 'user matrix', liwc_results.shape

    df = pd.DataFrame(data=liwc_results,
                      columns=[
                          'user_id', 'time_index', 'user_created_time',
                          'first_tweet_time', 'last_tweet_time', 'count'
                      ] + fields)
    df.to_csv(filename)
    df.to_pickle(filename + '.pick')
예제 #3
0
파일: liwc_batch.py 프로젝트: wtgme/ohsn
def bunch_user_tweets_panel(dbname, comname, timename, n=100):
    '''
    :param dbname: db name
    :param comname: user collection name
    :param timename: timeline collection name
    :param n: split tweets every n
    :return: pandas panel
    '''
    db = dbt.db_connect_no_auth(dbname)
    com = db[comname]
    times = db[timename]

    data = {}

    for user in com.find({'timeline_count': {'$gt': 500}}, ['id', 'id_str', 'created_at']):
        uid = user['id']

        liwc_results = []
        indices = []
        user_create_time = []
        first_tweet_time = []
        last_tweet_time = []
        fields = []

        count = 0
        index = 0
        tweets = []

        for tweet in times.find({'user.id': uid}).sort([("id", 1)]):
            if count < n:
                tweets.append(tweet)
                count += 1
            else:
                result = liwcp.process_tweet(tweets, Trim_rt=False)
                if result:
                    liwc_results.append([result[k] for k in result.keys()])
                    if len(fields) == 0:
                        fields = [k for k in result.keys()]
                    indices.append(index)
                    user_create_time.append(datetime.strptime(user['created_at'], '%a %b %d %H:%M:%S +0000 %Y'))
                    first_tweet_time.append(datetime.strptime(tweets[0]['created_at'], '%a %b %d %H:%M:%S +0000 %Y'))
                    last_tweet_time.append(datetime.strptime(tweets[-1]['created_at'], '%a %b %d %H:%M:%S +0000 %Y'))
                    # print index, count
                index += 1
                count = 0
                tweets = []
        liwc_results = np.array(liwc_results)
        size = len(user_create_time)
        user_create_time = np.reshape(user_create_time, (size, 1))
        first_tweet_time = np.reshape(first_tweet_time, (size, 1))
        last_tweet_time = np.reshape(last_tweet_time, (size, 1))
        liwc_results = np.append(liwc_results, user_create_time, axis=1)
        liwc_results = np.append(liwc_results, first_tweet_time, axis=1)
        liwc_results = np.append(liwc_results, last_tweet_time, axis=1)
        print liwc_results.shape


        df = pd.DataFrame(data=liwc_results,
                          columns=fields + ['user_created_time', 'first_tweet_time', 'last_tweet_time'],
                          index=indices)
        data[user['id_str']] = df
    pn = pd.Panel(data)
    pn.to_pickle('ed-liwc.panel')
예제 #4
0
def bunch_user_tweets_panel(dbname, comname, timename, n=100):
    '''
    :param dbname: db name
    :param comname: user collection name
    :param timename: timeline collection name
    :param n: split tweets every n
    :return: pandas panel
    '''
    db = dbt.db_connect_no_auth(dbname)
    com = db[comname]
    times = db[timename]

    data = {}

    for user in com.find({'timeline_count': {
            '$gt': 500
    }}, ['id', 'id_str', 'created_at']):
        uid = user['id']

        liwc_results = []
        indices = []
        user_create_time = []
        first_tweet_time = []
        last_tweet_time = []
        fields = []

        count = 0
        index = 0
        tweets = []

        for tweet in times.find({'user.id': uid}).sort([("id", 1)]):
            if count < n:
                tweets.append(tweet)
                count += 1
            else:
                result = liwcp.process_tweet(tweets, Trim_rt=False)
                if result:
                    liwc_results.append([result[k] for k in result.keys()])
                    if len(fields) == 0:
                        fields = [k for k in result.keys()]
                    indices.append(index)
                    user_create_time.append(
                        datetime.strptime(user['created_at'],
                                          '%a %b %d %H:%M:%S +0000 %Y'))
                    first_tweet_time.append(
                        datetime.strptime(tweets[0]['created_at'],
                                          '%a %b %d %H:%M:%S +0000 %Y'))
                    last_tweet_time.append(
                        datetime.strptime(tweets[-1]['created_at'],
                                          '%a %b %d %H:%M:%S +0000 %Y'))
                    # print index, count
                index += 1
                count = 0
                tweets = []
        liwc_results = np.array(liwc_results)
        size = len(user_create_time)
        user_create_time = np.reshape(user_create_time, (size, 1))
        first_tweet_time = np.reshape(first_tweet_time, (size, 1))
        last_tweet_time = np.reshape(last_tweet_time, (size, 1))
        liwc_results = np.append(liwc_results, user_create_time, axis=1)
        liwc_results = np.append(liwc_results, first_tweet_time, axis=1)
        liwc_results = np.append(liwc_results, last_tweet_time, axis=1)
        print liwc_results.shape

        df = pd.DataFrame(
            data=liwc_results,
            columns=fields +
            ['user_created_time', 'first_tweet_time', 'last_tweet_time'],
            index=indices)
        data[user['id_str']] = df
    pn = pd.Panel(data)
    pn.to_pickle('ed-liwc.panel')