Example #1
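Note: the snippets below omit their import headers. A minimal sketch of what they assume follows; the standard-library and third-party imports are evident from usage, while the project-local module paths are hypothetical placeholders (only the aliases dbt, iot, gt, dbutil, io appear in the code itself).

# Assumed common imports for the examples below.
import pickle
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from scipy import spatial

# Project-local helpers; these module paths are hypothetical placeholders.
import project.dbutil as dbt    # MongoDB helpers: db_connect_col, db_connect_no_auth, ...
import project.iofile as iot    # I/O helpers: read_fields, get_fields_one_doc, ...
import project.graphutil as gt  # igraph wrappers: load_network, summary, giant_component, ...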
def friend_network_hashtag_weight(dbname, netname):
    '''
    Weight friendship-network edges by hashtag-profile similarity
    (input for later community detection). The user_hash_profile data
    is loaded from a pickle, not passed as a parameter.
    :param dbname:
    :param netname:
    :return:
    '''
    user_hash_profile = pickle.load(open('data/user-hash-profile.pick', 'r'))
    net = gt.load_network(dbname, netname)
    fields = iot.read_fields()
    com = dbt.db_connect_col(dbname, 'scom')
    for edge in net.es:
        source_vertex_id = edge.source
        target_vertex_id = edge.target
        source_uid = int(net.vs[source_vertex_id]['name'])
        target_uid = int(net.vs[target_vertex_id]['name'])
        source_user = com.find_one({'id':source_uid})
        target_user = com.find_one({'id':target_uid})
        source_user_liwc = iot.get_fields_one_doc(source_user, fields)
        target_user_liwc = iot.get_fields_one_doc(target_user, fields)
        source_user_liwc.extend(user_hash_profile[source_uid])
        target_user_liwc.extend(user_hash_profile[target_uid])
        print len(target_user_liwc)
        dis = spatial.distance.euclidean(source_user_liwc, target_user_liwc)
        edge['weight'] = 1.0/(1.0 + dis)
    net.write_graphml('ed_weighted_follow.graphml')
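The function above only writes the weighted graph; the community detection named in its docstring would run as a separate step. A minimal sketch, assuming python-igraph (which the .vs/.es usage above implies):

import igraph

g = igraph.Graph.Read_GraphML('ed_weighted_follow.graphml')
g = g.as_undirected(combine_edges='max')                   # Louvain needs an undirected graph
clusters = g.community_multilevel(weights=g.es['weight'])  # weighted community detection
print(len(clusters))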
Example #2
def read_user_time(filename):
    fields = iot.read_fields()
    trimed_fields = [field.split('.')[-1] for field in fields]
    groups = [
         ('ED', 'fed', 'com', {'liwc_anal.result.WC': {'$exists': True}, 'level': 1}),
         ('RD', 'random', 'scom', {'liwc_anal.result.WC': {'$exists': True}}),
         ('YG', 'younger', 'scom', {'liwc_anal.result.WC': {'$exists': True}})
    ]

    data = []
    for tag, dbname, comname, filter_values in groups:
        com = dbt.db_connect_col(dbname, comname)

        for user in com.find(filter_values, no_cursor_timeout=True):
            if 'status' in user:
                created_at = datetime.strptime(user['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                scraped_at = user['scrape_timeline_at']
                last_post = datetime.strptime(user['status']['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                life_time = diff_day(last_post, created_at)
                average_time = float(life_time)/max(1, user['statuses_count'])  # max, not min: avoid zero division
                longest_tweet_intervalb = user['longest_tweet_interval']

                observation_interval = diff_day(scraped_at, last_post)
                if (observation_interval-longest_tweet_intervalb) > 30:
                    death = 1
                else:
                    death = 0
                values = iot.get_fields_one_doc(user, fields)
                data.append([user['id_str'], created_at, last_post, scraped_at, average_time,
                             longest_tweet_intervalb, observation_interval, tag, death] + values)

    df = pd.DataFrame(data, columns=['uid', 'created_at', 'last_post', 'scraped_at', 'average_time',
                                     'longest_time_interval', 'observation_interval', 'group',
                                     'event'] + trimed_fields)
    df.to_csv(filename)
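read_user_time calls a diff_day helper that none of these examples define. A minimal sketch consistent with how it is used (whole-day difference between two datetimes):

def diff_day(later, earlier):
    # Assumed helper: whole days between two datetime objects.
    return (later - earlier).days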
Example #3
def bio_all_in_line(dbname='TwitterProAna', colname='bio'):
    names = ['cbmi', 'gbmi', 'a', 'gender', 'h', 'cw', 'gw', 'lw', 'hw', 'ugw']
    fields = ['id', 'date'] + [name + '.value' for name in names]
    data = []
    bio = dbt.db_connect_col(dbname, colname)
    for entry in bio.find({}):
        dat = (iot.get_fields_one_doc(entry, fields))
        if set(dat[3:]) != set([0.0]):
            data.append(dat)
    df = pd.DataFrame(data=data, columns=['id', 'date'] + names)
    df.to_csv('ian-all'+'.csv')
Example #4
def bmi_regression(dbname, colname, filename):
    # regress bmi with features
    fields = iot.read_fields()
    poi_fields = fields[-9:-1]
    print poi_fields
    trimed_fields = [(field.split('.')[-1]) for field in fields]
    trimed_fields[-10:] = [
        'sentiment', 'age', 'gender', 'height', 'cw', 'gw', 'cbmi', 'gbmi',
        'edword', 'level'
    ]

    com = dbutil.db_connect_col(dbname, colname)
    data = []
    # for user in com.find({'$or': [{'text_anal.cbmi.value': {'$exists': True}},
    #                               {'text_anal.gbmi.value': {'$exists': True}}],
    #                       'liwc_anal.result.WC': {'$exists': True}}, no_cursor_timeout=True):
    com2 = dbutil.db_connect_col('fed2', colname)
    com3 = dbutil.db_connect_col('fed3', colname)
    for user in com.find({'liwc_anal.result.WC': {
            '$exists': True
    }},
                         no_cursor_timeout=True):
        values = iot.get_fields_one_doc(user, fields)
        user2 = com2.find_one({'id': user['id']})
        if user2:
            values.extend(iot.get_fields_one_doc(user2, poi_fields))
        else:
            values.extend([0] * len(poi_fields))
        user3 = com3.find_one({'id': user['id']})
        if user3:
            values.extend(iot.get_fields_one_doc(user3, poi_fields))
        else:
            values.extend([0] * len(poi_fields))
        data.append(values)
    df = pd.DataFrame(data,
                      columns=trimed_fields + [(field.split('.')[-2] + '_p2')
                                               for field in poi_fields] +
                      [(field.split('.')[-2] + '_p3') for field in poi_fields])
    df.to_csv(filename)
Example #5
def color_classify(userlabels, field_names, file_name, dbname):
    fw = open(file_name + '.data', 'w')
    db = dbt.db_connect_no_auth(dbname)
    poi = db['com']
    # format: 6,7,11,12 1:-0.022711 2:-0.050504 3:-0.035691
    for uid in userlabels.keys():
        labels = (userlabels[uid])
        user = poi.find_one({'id': uid}, ['liwc_anal.result'])
        values = io.get_fields_one_doc(user, field_names)
        outstr = ','.join(str(x) for x in labels)
        outstr += ' '
        for i in xrange(len(values)):
            outstr += str(i + 1) + ':' + str(values[i]) + ' '
        fw.write(outstr + '\n')
    fw.close()
Example #6
def feature_output(field_names,
                   file_name,
                   dbname,
                   colname,
                   label=None,
                   outids=False,
                   userset=None,           # None instead of a mutable default list
                   extend_features=None):  # None instead of a mutable default dict
    fw = open(file_name + '.data', 'w')
    poi = dbt.db_connect_col(dbname, colname)
    index = 0
    maxsize = 10000000000000000
    uids = list()
    # exclude_set = set([4319191638L, 2627223434L, 2976822286L, 4788248335L, 3289264086L, 520847919, 439647015, 947539758, 617442479, 2481703728L, 2913311029L, 3760687289L, 2303011905L, 1712561862, 2882255303L, 261549132, 982895821, 2849269327L, 312684498, 160044558, 774072534, 330611545, 430569947, 1275228253, 3399616094L, 2924322143L, 457692129, 3006221026L, 2837359399L, 18942418, 2848241137L, 273768180, 235857269, 3315086840L])

    for x in poi.find({
            # 'text_anal.edword_count.value': {'$gt': 0},
            # 'id': {'$in': userset},
            'liwc_anal.result.WC': {
                '$exists': True
            },
            # 'text_anal.gbmi': {'$exists': True},
            # 'timeline_count': {'$gt': 100},
            # 'level': {'$gt': 1}
    }):
        # if index < maxsize and int(x['id']) not in exclude_set:
        if index < maxsize:
            uid = int(x['id'])
            uids.append(uid)
            values = io.get_fields_one_doc(x, field_names)
            # if uid in extend_features:
            #     values.extend(extend_features[uid])
            if label:
                outstr = label + ' '
            else:
                outstr = x['id_str'] + ' '
            for i in xrange(len(values)):
                outstr += str(i + 1) + ':' + str(values[i]) + ' '
            index += 1
            fw.write(outstr + '\n')
    fw.close()
    print len(uids)
    if outids:
        pickle.dump(uids, open(file_name + '_ids.data', 'w'))
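A hypothetical call to feature_output, to show the libsvm-style lines it writes; the database and collection names are only illustrative:

# Writes one 'id 1:v1 2:v2 ...' line per matching user to ed.data
# and, with outids=True, pickles the matching user ids to ed_ids.data.
fields = iot.read_fields()
feature_output(fields, 'ed', 'fed', 'com', outids=True)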
Example #7
def out_data():
    control = dbt.db_connect_col('fed', 'control_com')
    treat = dbt.db_connect_col('fed', 'treat_com')
    control_user = iot.get_values_one_field('fed', 'control_com', 'id', {'prior_liwc.result.WC':{'$exists': True},
                                                                'post_liwc.result.WC':{'$exists': True}})
    treat_user = iot.get_values_one_field('fed', 'treat_com', 'id', {'prior_liwc.result.WC':{'$exists': True},
                                                                'post_liwc.result.WC':{'$exists': True}})
    data = []
    fields = iot.read_fields()
    prefix = ['prior_liwc', 'post_liwc']
    for i in xrange(2):
        uids = [control_user, treat_user][i]
        for uid in uids:
            user = [control, treat][i].find_one({'id': uid})
            for j in xrange(2):
                fields_new = ['id_str']+[field.replace('liwc_anal', prefix[j]) for field in fields]
                values = iot.get_fields_one_doc(user, fields_new)
                data.append(values+[i, j])

    df = pd.DataFrame(data, columns=['id']+[field.split('.')[-1] for field in fields]+['treated', 'time'])

    df.to_csv('treatment.csv')
Example #8
def emotion_recovery_IV_following(dbname1, dbname2, comname1, comname2):
    '''
    Only use following stats
    :param dbname1:
    :param dbname2:
    :param comname1:
    :param comname2:
    :return:
    '''
    print 'load liwc 2 batches'
    df = pd.read_pickle('ed-liwc2stage.csv'+'.pick')
    filter_que = {'level': 1, 'liwc_anal.result.WC':{'$exists': True}}
    user1 = iot.get_values_one_field(dbname1, comname1, 'id', filter_que)
    com1 = dbt.db_connect_col(dbname1, comname1)
    com2 = dbt.db_connect_col(dbname2, comname2)
    fields = ['liwc_anal.result.posemo',
              'liwc_anal.result.negemo',
              'liwc_anal.result.ingest',
              'liwc_anal.result.bio',
              'liwc_anal.result.body',
              'liwc_anal.result.health',
              'liwc_anal.result.death'
              # 'liwc_anal.result.anx',
              # 'liwc_anal.result.anger',
              # 'liwc_anal.result.sad'
              ]
    trimed_fields = [field.split('.')[-1] for field in fields]
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
        'friends_day', 'statuses_day', 'followers_day', 'days', 'eigenvector']
    attr_names = ['uid', 'attr', 'u_timeline_count_2p']
    attr_names.extend(['u_'+field for field in trimed_fields])
    attr_names.extend(['u_prior_'+field for field in trimed_fields])
    attr_names.extend(['u_post_'+field for field in trimed_fields])
    attr_names.extend(['u_change_'+field for field in trimed_fields])
    attr_names.extend(['u_'+field for field in prof_names])
    attr_names.extend(['u_recovery_tweets', 'u_timeline_count'])
    attr_names.extend(['f_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['f_'+field for field in prof_names])
    attr_names.extend(['f_num', 'f_palive'])
    print attr_names


    # NOTE: the original snippet referenced network1 and eigen_map without
    # defining them; this setup is assumed, mirroring the companion examples.
    network1 = gt.load_network(dbname1, 'net')
    eigen = network1.eigenvector_centrality()
    nodes = [int(v['name']) for v in network1.vs]
    eigen_map = dict(zip(nodes, eigen))

    data = []
    for uid in user1:
        # set uid
        row = [uid]
        # set attrition states
        u1 = com1.find_one({'id': uid})
        u2 = com2.find_one({'id': uid})
        if u2 is None or u2['timeline_count'] == 0:
            row.extend([None]*2)
        else:
            row.extend([u2['recovery_tweets'], u2['timeline_count']])
        # set users liwc feature
        uatt = iot.get_fields_one_doc(u1, fields)
        row.extend(uatt)
        # set users liwc changes
        uvs = df[df.user_id == str(uid)].loc[:, trimed_fields]
        # print uvs
        if len(uvs) == 2:
            changes, priors, posts = [], [], []
            for name in trimed_fields:
                old = uvs.iloc[0][name]
                new = uvs.iloc[1][name]
                priors.append(old)
                posts.append(new)
                changes.append(new - old)
            row.extend(priors)
            row.extend(posts)
            row.extend(changes)
        else:
            row.extend([None]*(len(trimed_fields)*3))

        # set profile, active days and eigenvector centrality
        row.extend(active_days(u1))
        row.extend([eigen_map.get(u1['id'])])
        row.extend([u1['recovery_tweets'], u1['timeline_count']])

        exist = True
        try:
            v = network1.vs.find(name=str(uid))
        except ValueError:
            exist = False
        if exist:
            # friends = set(network1.neighbors(str(uid))) # id or name
            friends = set(network1.successors(str(uid)))
            if len(friends) > 0:
                friend_ids = [int(network1.vs[vi]['name']) for vi in friends] # return id
                print uid in friend_ids
                print len(friend_ids)
                fatts = []
                alive = 0
                for fid in friend_ids:
                    fu = com1.find_one({'id': fid, 'liwc_anal.result.WC':{'$exists':True}, 'status':{'$exists':True}})
                    fu2 = com2.find_one({'id': fid})
                    if fu != None:
                        fatt = iot.get_fields_one_doc(fu, fields)
                        fatt.extend(active_days(fu))
                        fatt.extend([eigen_map.get(fu['id'])])

                        fatts.append(fatt)
                        if fu2 is None or fu2['timeline_count'] == 0:
                            alive += 0
                        else:
                            alive += 1
                if len(fatts) > 0:
                    fatts = np.array(fatts)
                    fmatts = np.mean(fatts, axis=0)
                    row.extend(fmatts)
                    row.append(len(fatts))
                    paliv = float(alive)/len(fatts)
                    print 'Alive %d %d %.3f' % (alive, len(fatts), paliv)
                    row.append(paliv)
        # pad short rows (user absent from network or no qualifying friends)
        row.extend([None] * (len(attr_names) - len(row)))
        data.append(row)
    df = pd.DataFrame(data, columns=attr_names)
    df.to_csv('data-recover-following.csv', index=False)
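This example and several below fold an active_days(user) result into the friends_count/.../days profile columns, but the helper itself is never shown. A sketch of what it plausibly computes, with the return order inferred from prof_names; the exact definition is an assumption:

def active_days(user):
    # Assumed helper: raw profile counts, their per-day rates over the
    # account's lifetime, and the lifetime in days -- matching prof_names.
    created = datetime.strptime(user['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
    last = datetime.strptime(user['status']['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
    days = max((last - created).days, 1)
    counts = [user['friends_count'], user['statuses_count'], user['followers_count']]
    return counts + [c / float(days) for c in counts] + [days]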
Example #9
def attribute_corre(filename):
    fields = [
        'liwc_anal.result.posemo',
        'liwc_anal.result.negemo',
        'liwc_anal.result.ingest',
        'liwc_anal.result.bio',
        'liwc_anal.result.body',
        'liwc_anal.result.health',
        'liwc_anal.result.death',  # comma added: was silently fused with the next field
        'liwc_anal.result.anx',
        'liwc_anal.result.anger',
        'liwc_anal.result.sad',
        'liwc_anal.result.i',
        'liwc_anal.result.we',
        'liwc_anal.result.negate',
        'liwc_anal.result.swear',
        'liwc_anal.result.social',
        'liwc_anal.result.family',
        'liwc_anal.result.friend',
        'liwc_anal.result.affect',
        'senti.result.whole.posm',
        # 'senti.result.whole.posstd',
        'senti.result.whole.negm',
        # 'senti.result.whole.negstd',
        'senti.result.whole.scalem',
        # 'senti.result.whole.scalestd',
        'senti.result.whole.N',
        'senti.result.prior.scalem',
        'senti.result.post.scalem'
    ]
    trimed_fields = ['-'.join(field.split('.')[-2:]) for field in fields]
    groups = [('ED', 'fed', 'com', 'fed', 'com_survival', {
        'liwc_anal.result.WC': {
            '$exists': True
        },
        'level': 1,
        'senti.result.whole.N': {
            '$gt': 10
        }
    }),
              ('RD', 'random', 'scom', 'random', 'com_survival', {
                  'liwc_anal.result.WC': {
                      '$exists': True
                  },
                  'senti.result.whole.N': {
                      '$gt': 10
                  }
              }),
              ('YG', 'younger', 'scom', 'younger', 'com_survival', {
                  'liwc_anal.result.WC': {
                      '$exists': True
                  },
                  'senti.result.whole.N': {
                      '$gt': 10
                  }
              })]
    data = []
    for tag, dbname, comname, dbname2, comname2, filter_values in groups:
        com = dbt.db_connect_col(dbname, comname)
        network1 = gt.Graph.Read_GraphML(tag.lower() +
                                         '-net-all-active.graphml')

        for user in com.find(filter_values, no_cursor_timeout=True):
            uid = user['id']
            level = user.get('level')  # 'level' may be absent outside the ED group
            values = iot.get_fields_one_doc(user, fields)
            exist = True
            try:
                v = network1.vs.find(name=str(uid))
            except ValueError:
                exist = False
            if exist:
                # friends = set(network1.neighbors(str(uid))) # id or name
                friends = set(network1.successors(str(uid)))
                fatts = []
                if len(friends) > 0:
                    friend_ids = [
                        int(network1.vs[vi]['name']) for vi in friends
                    ]  # return id
                    print uid in friend_ids
                    print len(friend_ids)
                    fatts = []
                    alive = 0
                    for fid in friend_ids:
                        fu = com.find_one({
                            'id': fid,
                            'liwc_anal.result.WC': {
                                '$exists': True
                            },
                            'senti.result.whole.N': {
                                '$gt': 10
                            }
                        })
                        if fu:
                            fatt = iot.get_fields_one_doc(fu, fields)
                            fatts.append(fatt)
                            alive += 1
                    if len(fatts) > 0:
                        fatts = np.array(fatts)
                        fmatts = np.mean(fatts, axis=0)
                        values.extend(fmatts)
                        data.append(
                            [user['id_str'], level, tag, alive,
                             len(friends)] + values)
    df = pd.DataFrame(
        data,
        columns=['uid', 'level', 'group', 'alive_friends', 'all_friends'] +
        ['u_' + field
         for field in trimed_fields] + ['f_' + tf for tf in trimed_fields])
    df.to_csv(filename)
Example #10
def user_profiles(dbname, comname, userfile='data/actor.uid'):
    # Get profile info for regression.
    uids = pickle.load(open(userfile))
    print len(uids)
    com = dbt.db_connect_col(dbname, comname)
    newcom = dbt.db_connect_col(dbname, 'pro_mention_miss_com')

    # newcom.create_index("id", unique=True)
    # # Collect miss data
    # missuids, taguids = [], []
    # for uid in uids:
    #     user = com.find_one({'id': int(uid)})
    #     if user is None:
    #         missuids.append(int(uid))
    #     else:
    #         taguids.append(int(uid))
    # list_size = len(missuids)
    # print '%d users to process' %list_size
    # length = int(math.ceil(list_size/100.0))
    # for index in xrange(length):
    #     index_begin = index*100
    #     index_end = min(list_size, index_begin+100)
    #     userlook.lookup_user_list(missuids[index_begin:index_end], newcom, 1, 'N')

    # # Collect tweets for missing users
    # converstream = dbt.db_connect_col(dbname, 'pro_mention_timeline')
    # most_recenty = converstream.find().sort([('id', -1)]).limit(1)
    # oldest = converstream.find().sort([('id', 1)]).limit(1)
    # max_id = most_recenty[0]['id']
    # since_id = oldest[0]['id']
    # print most_recenty[0]
    # print oldest[0]
    # com = dbt.db_connect_col(dbname, 'pro_mention_miss_com')
    # timeline = dbt.db_connect_col(dbname, 'pro_mention_miss_timeline')

    # com.create_index([('timeline_scraped_times', pymongo.ASCENDING)])
    # timeline.create_index([('user.id', pymongo.ASCENDING),
    #                       ('id', pymongo.DESCENDING)])
    # timeline.create_index([('id', pymongo.ASCENDING)], unique=True)

    # print datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + "\t" + 'Connect Twitter.com'
    # timelines.retrieve_timeline(com, timeline, max_id)
    # print datetime.now().strftime("%Y-%m-%d-%H-%M-%S"), 'finish timeline for sample users'

    data = []
    fields = iot.read_fields()
    miss_count = 0
    print fields
    for uid in uids:
        user = com.find_one({'id': int(uid)})
        if user is not None:
            row = iot.get_fields_one_doc(user, fields)
            data.append(row)
        else:
            user = newcom.find_one({'id': int(uid)})
            if user is not None:
                row = iot.get_fields_one_doc(user, fields)
                data.append(row)
            else:
                miss_count += 1
    print miss_count, miss_count * 1.0 / len(uids)
    df = pd.DataFrame(data=data, columns=['uid', 'posemo', 'negemo', 'senti'])
    df.to_csv('data/emotions.csv')
Example #11
def emotion_dropout_IV_split(dbname1, dbname2, comname1, comname2):
    '''
    Split followees and followers as different variables
    :param dbname1:
    :param dbname2:
    :param comname1:
    :param comname2:
    :return:
    '''
    filter_que = {'level': 1, 'liwc_anal.result.WC':{'$exists': True}}
    user1 = iot.get_values_one_field(dbname1, comname1, 'id', filter_que)
    com1 = dbt.db_connect_col(dbname1, comname1)
    com2 = dbt.db_connect_col(dbname2, comname2)
    fields = ['liwc_anal.result.posemo',
              'liwc_anal.result.negemo',
              'liwc_anal.result.anx',
              'liwc_anal.result.anger',
              'liwc_anal.result.sad']
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
        'friends_day', 'statuses_day', 'followers_day', 'days']
    attr_names = ['uid', 'attr']
    attr_names.extend(['u_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['u_'+field for field in prof_names])
    attr_names.extend(['fr_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['fr_'+field for field in prof_names])
    attr_names.extend(['fr_num', 'fr_palive'])
    attr_names.extend(['fo_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['fo_'+field for field in prof_names])
    attr_names.extend(['fo_num', 'fo_palive'])
    attr_names.extend(['co_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['co_'+field for field in prof_names])
    attr_names.extend(['co_num', 'co_palive'])
    print attr_names
    attr_length = len(fields) + len(prof_names) + 2
    network1 = gt.load_network(dbname1, 'net')
    data = []
    for uid in user1:
        row = [uid]
        u1 = com1.find_one({'id': uid})
        u2 = com2.find_one({'id': uid})
        if u2 is None or u2['timeline_count'] == 0:
            row.append(1)
        else:
            row.append(0)
        uatt = iot.get_fields_one_doc(u1, fields)
        row.extend(uatt)
        row.extend(active_days(u1))
        exist = True
        try:
            v = network1.vs.find(name=str(uid))
        except ValueError:
            exist = False
        if exist:
            print '--------------------user %d---------------' %uid
            followees = set([int(network1.vs[v]['name']) for v in network1.successors(str(uid))])
            followers = set([int(network1.vs[v]['name']) for v in network1.predecessors(str(uid))])
            common = followees.intersection(followers)
            followees = followees - common
            followers = followers - common
            for friend_ids in [followees, followers, common]:
                if len(friend_ids) > 0:
                    # friend_ids = [int(network1.vs[v]['name']) for v in friends]
                    print uid in friend_ids
                    print len(friend_ids)
                    fatts = []
                    alive = 0
                    for fid in friend_ids:
                        fu = com1.find_one({'id': fid, 'liwc_anal.result.WC':{'$exists':True}, 'status':{'$exists':True}})
                        fu2 = com2.find_one({'id': fid})
                        if fu != None:
                            fatt = iot.get_fields_one_doc(fu, fields) # Friends' LIWC
                            fatt.extend(active_days(fu))
                            fatts.append(fatt)
                            if fu2 is None or fu2['timeline_count'] == 0:
                                alive += 0
                            else:
                                alive += 1
                    if len(fatts) > 0:
                        fatts = np.array(fatts)
                        fmatts = np.mean(fatts, axis=0)
                        row.extend(fmatts)
                        row.append(len(fatts))
                        paliv = float(alive)/len(fatts)
                        print 'Alive %d %d %.3f' % (alive, len(fatts), paliv)
                        row.append(paliv)
                else:
                    row.extend([None] * attr_length)
        # pad short rows (user absent from network or no qualifying friends)
        row.extend([None] * (len(attr_names) - len(row)))
        data.append(row)
    df = pd.DataFrame(data, columns=attr_names)
    df.to_csv('data-attr-split.csv', index=False)
Example #12
def emotion_dropout_IV_combine(dbname1, dbname2, comname1, comname2):
    '''
    Combine followees and follower together as variables
    :param dbname1:
    :param dbname2:
    :param comname1:
    :param comname2:
    :return:
    '''
    filter_que = {'level': 1, 'liwc_anal.result.WC':{'$exists': True}}
    user1 = iot.get_values_one_field(dbname1, comname1, 'id', filter_que)
    com1 = dbt.db_connect_col(dbname1, comname1)
    com2 = dbt.db_connect_col(dbname2, comname2)
    fields = ['liwc_anal.result.posemo',
              'liwc_anal.result.negemo',
              'liwc_anal.result.anx',
              'liwc_anal.result.anger',
              'liwc_anal.result.sad']
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
        'friends_day', 'statuses_day', 'followers_day', 'days']
    attr_names = ['uid', 'attr']
    attr_names.extend(['u_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['u_'+field for field in prof_names])
    attr_names.extend(['f_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['f_'+field for field in prof_names])
    attr_names.extend(['f_num', 'f_palive'])
    print attr_names
    network1 = gt.load_network(dbname1, 'net')
    data = []
    for uid in user1:
        row = [uid]
        u1 = com1.find_one({'id': uid})
        u2 = com2.find_one({'id': uid})
        if u2 is None or u2['timeline_count'] == 0:
            row.append(1)
        else:
            row.append(0)
        uatt = iot.get_fields_one_doc(u1, fields)
        row.extend(uatt)
        row.extend(active_days(u1))
        exist = True
        try:
            v = network1.vs.find(name=str(uid))
        except ValueError:
            exist = False
        if exist:
            friends = set(network1.neighbors(str(uid))) # id or name
            if len(friends) > 0:
                friend_ids = [int(network1.vs[v]['name']) for v in friends] # return id
                print uid in friend_ids
                print len(friend_ids)
                fatts = []
                alive = 0
                for fid in friend_ids:
                    fu = com1.find_one({'id': fid, 'liwc_anal.result.WC':{'$exists':True}, 'status':{'$exists':True}})
                    fu2 = com2.find_one({'id': fid})
                    if fu != None:
                        fatt = iot.get_fields_one_doc(fu, fields)
                        fatt.extend(active_days(fu))
                        fatts.append(fatt)
                        if fu2 is None or fu2['timeline_count'] == 0:
                            alive += 0
                        else:
                            alive += 1
                if len(fatts) > 0:
                    fatts = np.array(fatts)
                    fmatts = np.mean(fatts, axis=0)
                    row.extend(fmatts)
                    row.append(len(fatts))
                    paliv = float(alive)/len(fatts)
                    print 'Alive %d %d %.3f' % (alive, len(fatts), paliv)
                    row.append(paliv)
        # pad short rows (user absent from network or no qualifying friends)
        row.extend([None] * (len(attr_names) - len(row)))
        data.append(row)
    df = pd.DataFrame(data, columns=attr_names)
    df.to_csv('data-attr-combine.csv', index=False)
Example #13
def emotion_dropout_IV_split(dbname1, dbname2, comname1, comname2):
    '''
    Split followees and followers as different variables
    :param dbname1:
    :param dbname2:
    :param comname1:
    :param comname2:
    :return:
    '''
    filter_que = {'level': 1, 'liwc_anal.result.WC':{'$exists': True}}
    user1 = iot.get_values_one_field(dbname1, comname1, 'id', filter_que)
    com1 = dbt.db_connect_col(dbname1, comname1)
    com2 = dbt.db_connect_col(dbname2, comname2)
    fields = ['liwc_anal.result.posemo',
              'liwc_anal.result.negemo',
              'liwc_anal.result.anx',
              'liwc_anal.result.anger',
              'liwc_anal.result.sad'
              ]
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
        'friends_day', 'statuses_day', 'followers_day', 'days', 'eigenvector']
    attr_names = ['uid', 'attr']
    attr_names.extend(['u_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['u_'+field for field in prof_names])
    attr_names.extend(['fr_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['fr_'+field for field in prof_names])
    attr_names.extend(['fr_num', 'fr_palive'])
    attr_names.extend(['fo_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['fo_'+field for field in prof_names])
    attr_names.extend(['fo_num', 'fo_palive'])
    attr_names.extend(['co_'+field.split('.')[-1] for field in fields])
    attr_names.extend(['co_'+field for field in prof_names])
    attr_names.extend(['co_num', 'co_palive'])
    print attr_names
    attr_length = len(fields) + len(prof_names) + 2
    network1 = gt.load_network(dbname1, 'net')

    '''Centralities Calculation'''
    eigen = network1.eigenvector_centrality()
    # closeness = network1.closeness()
    # betweenness = network1.betweenness()
    nodes = [int(v['name']) for v in network1.vs]
    eigen_map = dict(zip(nodes, eigen))
    # closeness_map = dict(zip(nodes, closeness))
    # betweenness_map = dict(zip(nodes, betweenness))

    data = []
    for uid in user1:
        row = [uid]
        u1 = com1.find_one({'id': uid})
        u2 = com2.find_one({'id': uid})
        if u2 is None or u2['timeline_count'] == 0:
            row.append(1)
        else:
            row.append(0)
        uatt = iot.get_fields_one_doc(u1, fields)
        row.extend(uatt)
        row.extend(active_days(u1))
        row.extend([eigen_map.get(u1['id'])])


        exist = True
        try:
            v = network1.vs.find(name=str(uid))
        except ValueError:
            exist = False
        if exist:
            print '--------------------user %d---------------' %uid
            followees = set([int(network1.vs[v]['name']) for v in network1.successors(str(uid))])
            followers = set([int(network1.vs[v]['name']) for v in network1.predecessors(str(uid))])
            common = followees.intersection(followers)
            followees = followees - common
            followers = followers - common
            for friend_ids in [followees, followers, common]:
                if len(friend_ids) > 0:
                    # friend_ids = [int(network1.vs[v]['name']) for v in friends]
                    print uid in friend_ids
                    print len(friend_ids)
                    fatts = []
                    alive = 0
                    for fid in friend_ids:
                        fu = com1.find_one({'id': fid, 'liwc_anal.result.WC':{'$exists':True}, 'status':{'$exists':True}})
                        fu2 = com2.find_one({'id': fid})
                        if fu != None:
                            fatt = iot.get_fields_one_doc(fu, fields) # Friends' LIWC
                            fatt.extend(active_days(fu))
                            fatt.extend([eigen_map.get(fu['id'])])

                            fatts.append(fatt)
                            if fu2 is None or fu2['timeline_count'] == 0:
                                alive += 0
                            else:
                                alive += 1
                    if len(fatts) > 0:
                        fatts = np.array(fatts)
                        fmatts = np.mean(fatts, axis=0)
                        row.extend(fmatts)
                        row.append(len(fatts))
                        paliv = float(alive)/len(fatts)
                        print 'Alive %d %d %.3f' % (alive, len(fatts), paliv)
                        row.append(paliv)
                else:
                    row.extend([None] * attr_length)
        # pad short rows (user absent from network or no qualifying friends)
        row.extend([None] * (len(attr_names) - len(row)))
        data.append(row)
    df = pd.DataFrame(data, columns=attr_names)
    df.to_csv('data-attr-split.csv', index=False)
Example #14
def read_user_time_iv(filename):
    # fields = iot.read_fields()
    fields = [
        'liwc_anal.result.posemo', 'liwc_anal.result.negemo',
        'liwc_anal.result.ingest', 'liwc_anal.result.bio',
        'liwc_anal.result.body', 'liwc_anal.result.health',
        'liwc_anal.result.death',  # comma added: was silently fused with the next field
        'liwc_anal.result.anx', 'liwc_anal.result.anger',
        'liwc_anal.result.sad'
    ]
    prof_names = [
        'friends_count', 'statuses_count', 'followers_count', 'friends_day',
        'statuses_day', 'followers_day', 'days'
    ]

    trimed_fields = [field.split('.')[-1] for field in fields]
    groups = [('ED', 'fed', 'com', 'fed_sur', 'com',
               '2017-06-21 14:57:39+00:00', {
                   'liwc_anal.result.WC': {
                       '$exists': True
                   },
                   'level': 1
               }),
              ('RD', 'random', 'scom', 'random_sur', 'com',
               '2017-06-21 14:57:39+00:00', {
                   'liwc_anal.result.WC': {
                       '$exists': True
                   }
               }),
              ('YG', 'younger', 'scom', 'younger_sur', 'com',
               '2017-06-21 14:57:39+00:00', {
                   'liwc_anal.result.WC': {
                       '$exists': True
                   }
               })]

    data = []
    for tag, dbname, comname, dbname2, comname2, second_time, filter_values in groups:
        com = dbt.db_connect_col(dbname, comname)
        com2 = dbt.db_connect_col(dbname2, comname2)
        network1 = gt.Graph.Read_GraphML(tag.lower() + '-net.graphml')
        gt.summary(network1)
        network1_gc = gt.giant_component(network1)
        gt.summary(network1_gc)
        '''Centralities Calculation'''
        eigen = network1_gc.eigenvector_centrality()
        pageranks = network1_gc.pagerank()
        indegree = network1_gc.authority_score()
        outdegree = network1_gc.hub_score()

        nodes = [int(v['name']) for v in network1_gc.vs]
        eigen_map = dict(zip(nodes, eigen))
        pagerank_map = dict(zip(nodes, pageranks))
        indegree_map = dict(zip(nodes, indegree))
        outdegree_map = dict(zip(nodes, outdegree))

        print 'load liwc 2 batches: ' + tag.lower() + '-liwc2stage.csv'
        liwc_df = pd.read_pickle(tag.lower() + '-liwc2stage.csv' + '.pick')

        for user in com.find(filter_values, no_cursor_timeout=True):
            first_scraped_at = user['_id'].generation_time.replace(tzinfo=None)
            if 'status' in user:
                uid = user['id']
                u2 = com2.find_one({'id': uid})

                first_last_post = datetime.strptime(
                    user['status']['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                last_post = first_last_post
                drop = 1
                # initialise so later references cannot raise NameError or
                # reuse stale values from a previous user when u2 is missing
                second_scraped_at, second_last_post = None, None
                if u2:
                    second_scraped_at = u2['_id'].generation_time.replace(
                        tzinfo=None)
                    if 'status' in u2:
                        second_last_post = datetime.strptime(
                            u2['status']['created_at'],
                            '%a %b %d %H:%M:%S +0000 %Y')
                        if first_scraped_at < second_last_post < second_scraped_at:
                            drop = 0
                            last_post = second_last_post

                created_at = datetime.strptime(user['created_at'],
                                               '%a %b %d %H:%M:%S +0000 %Y')
                life_time = diff_day(last_post, created_at)
                average_time = float(life_time) / max(1, user['statuses_count'])  # max, not min: avoid zero division
                longest_tweet_intervalb = user['longest_tweet_interval']
                u_timeline_count = user['timeline_count']

                values = iot.get_fields_one_doc(user, fields)
                level = user['level']

                # set users liwc changes
                uvs = liwc_df[liwc_df.user_id == str(uid)].loc[:,
                                                               trimed_fields]
                # print uvs
                if len(uvs) == 2:
                    changes, priors, posts = [], [], []
                    for name in trimed_fields:
                        old = uvs.iloc[0][name]
                        new = uvs.iloc[1][name]
                        priors.append(old)
                        posts.append(new)
                        changes.append(new - old)
                    liwc_changes = priors + posts + changes
                else:
                    liwc_changes = [None] * (len(trimed_fields) * 3)
                u_centrality = eigen_map.get(user['id'], 0)
                u_pagerank = pagerank_map.get(user['id'], 0)
                u_indegree = indegree_map.get(user['id'], 0)
                u_outdegree = outdegree_map.get(user['id'], 0)

                values.extend(liwc_changes)
                values.extend(active_days(user))
                '''Get friends' profiles'''
                exist = True
                try:
                    v = network1.vs.find(name=str(uid))
                except ValueError:
                    exist = False
                if exist:
                    # friends = set(network1.neighbors(str(uid))) # id or name
                    friends = set(network1.successors(str(uid)))
                    if len(friends) > 0:
                        friend_ids = [
                            int(network1.vs[vi]['name']) for vi in friends
                        ]  # return id
                        print uid in friend_ids
                        print len(friend_ids)
                        fatts = []
                        alive = 0
                        for fid in friend_ids:
                            fu = com.find_one({
                                'id': fid,
                                'liwc_anal.result.WC': {
                                    '$exists': True
                                }
                            })
                            fu2 = com2.find_one({'id': fid})

                            if fu:
                                f1_time = fu['_id'].generation_time.replace(
                                    tzinfo=None)
                                # if eigen_map.get(fu['id'], 0) > 0.0001:
                                if True:
                                    fatt = iot.get_fields_one_doc(fu, fields)
                                    factive = active_days(fu)
                                    if fu2:
                                        f2_time = fu2[
                                            '_id'].generation_time.replace(
                                                tzinfo=None)
                                        if 'status' in fu2:
                                            fsecond_last_post = datetime.strptime(
                                                fu2['status']['created_at'],
                                                '%a %b %d %H:%M:%S +0000 %Y')
                                            if f1_time < fsecond_last_post < f2_time:
                                                alive += 1
                                                factive = active_days(fu2)

                                    fatt.extend(factive)
                                    fatt.extend([
                                        eigen_map.get(fu['id'], 0),
                                        pagerank_map.get(fu['id'], 0),
                                        indegree_map.get(fu['id'], 0),
                                        outdegree_map.get(fu['id'], 0)
                                    ])
                                    fatts.append(fatt)

                        # thredhold = user['friends_count']*0.5

                        if len(fatts) > 0:
                            fatts = np.array(fatts)
                            fmatts = np.mean(fatts, axis=0)
                            values.extend(fmatts)
                            paliv = float(alive) / len(fatts)
                            data.append([
                                user['id_str'], level, drop, created_at,
                                first_last_post, second_last_post, last_post,
                                first_scraped_at, second_scraped_at,
                                average_time, longest_tweet_intervalb, tag,
                                u_centrality, u_pagerank, u_indegree,
                                u_outdegree, u_timeline_count
                            ] + values + [len(fatts), paliv])

    df = pd.DataFrame(
        data,
        columns=[
            'uid', 'level', 'dropout', 'created_at', 'first_last_post',
            'second_last_post', 'last_post', 'first_scraped_at',
            'second_scraped_at', 'average_time', 'longest_time_interval',
            'group', 'u_eigenvector', 'u_pagerank', 'u_authority', 'u_hub',
            'u_timeline_count'
        ] + ['u_' + field for field in trimed_fields] +
        ['u_prior_' + field for field in trimed_fields] +
        ['u_post_' + field for field in trimed_fields] +
        ['u_change_' + field
         for field in trimed_fields] + ['u_' + field for field in prof_names] +
        ['f_' + tf
         for tf in trimed_fields] + ['f_' + field for field in prof_names] + [
             'f_eigenvector', 'f_pagerank', 'f_authority', 'f_hub', 'f_num',
             'f_palive'
         ])
    df.to_csv(filename)
Example #15
def emotion_dropout_IV_following(filepath):
    '''
    Only use following stats
    :param filepath: output CSV path
    :return:
    '''

    fields = [
            'senti.result.whole.posm',
            'senti.result.whole.posstd',
            'senti.result.whole.negm',
            'senti.result.whole.negstd',
            'senti.result.whole.scalem',
            'senti.result.whole.scalestd',
            'senti.result.whole.N',
            'senti.result.prior.scalem',
            'senti.result.post.scalem',
              # 'liwc_anal.result.posemo',
              # 'liwc_anal.result.negemo',
              # 'liwc_anal.result.ingest',
              # 'liwc_anal.result.bio',
              # 'liwc_anal.result.body',
              # 'liwc_anal.result.health',
              # 'liwc_anal.result.death'
              # 'liwc_anal.result.anx',
              # 'liwc_anal.result.anger',
              # 'liwc_anal.result.sad'
              ]
    trimed_fields = ['-'.join(field.split('.')[-2:]) for field in fields]  # [-2:], not [-2:-1]: keep both name parts to avoid duplicate columns
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
        'friends_day', 'statuses_day', 'followers_day', 'days', 'eigenvector', 'pagerank', 'authority', 'hub']
    attr_names = ['uid', 'group', 'attr', 'level']
    attr_names.extend(['u_'+field for field in trimed_fields])
    # attr_names.extend(['u_prior_'+field for field in trimed_fields])
    # attr_names.extend(['u_post_'+field for field in trimed_fields])
    # attr_names.extend(['u_change_'+field for field in trimed_fields])
    attr_names.extend(['u_'+field for field in prof_names])
    attr_names.extend([
        # 'u_recovery_tweets',
                       'u_timeline_count'])
    attr_names.extend(['f_'+field for field in trimed_fields])  # reuse trimed_fields: plain [-1] names collide (three 'scalem's)
    attr_names.extend(['f_'+field for field in prof_names])
    attr_names.extend(['f_timeline_count', 'f_num', 'f_palive'])
    print attr_names

    data = []
    name_map = {
        'ed': ('fed', 'fed_sur', 'com', 'com', {'level': 1, 'liwc_anal.result.WC': {'$exists': True}}),
        'yg': ('younger', 'younger_sur', 'scom', 'com', {'liwc_anal.result.WC': {'$exists': True}}),
        'rd': ('random', 'random_sur', 'scom', 'com', {'liwc_anal.result.WC': {'$exists': True}})
    }
    for groupname in [
        'yg', 'rd',
        'ed']:
        dbname1, dbname2, comname1, comname2, filter_que = name_map[groupname]
        print 'Centrality Calculate .........'
        # users = iot.get_values_one_field('fed', 'com', 'id', {'level': {'$lt': 3}})

        # print 'Number of users', len(users)
        # network1 = gt.load_network_subset('fed', 'net', {'user': {'$in': users}, 'follower': {'$in': users}})
        # network1 = gt.load_network('fed', 'net')
        # pickle.dump(network1, open('net.pick', 'w'))

        print 'load network: ' + groupname + '-net.graphml'
        network1 = gt.Graph.Read_GraphML(groupname + '-net.graphml')
        # network1 = pickle.load(open('net.pick', 'r'))
        gt.summary(network1)
        network1_gc = gt.giant_component(network1)
        gt.summary(network1_gc)

        '''Centralities Calculation'''
        eigen = network1_gc.eigenvector_centrality()
        pageranks = network1_gc.pagerank()
        indegree = network1_gc.authority_score()
        outdegree = network1_gc.hub_score()
        # closeness = network.closeness()
        # betweenness = network.betweenness()
        # print len(eigen), len(closeness), len(betweenness)

        nodes = [int(v['name']) for v in network1_gc.vs]
        # print len(nodes), len(eigen)
        # print type(nodes), type(eigen)

        eigen_map = dict(zip(nodes, eigen))
        pagerank_map = dict(zip(nodes, pageranks))
        indegree_map = dict(zip(nodes, indegree))
        outdegree_map = dict(zip(nodes, outdegree))
        # print eigen_map.get(nodes[1]), type(eigen_map.get(nodes[1]))

        # closeness_map = dict(zip(nodes, closeness))
        # betweenness_map = dict(zip(nodes, betweenness))
        print 'Centrality Calculate .........'

        # print 'load liwc 2 batches: ' + groupname+'-liwc2stage.csv'
        # df = pd.read_pickle(groupname+'-liwc2stage.csv'+'.pick')

        user1 = iot.get_values_one_field(dbname1, comname1, 'id', filter_que)

        print 'load db1: ', dbname1, comname1
        com1 = dbt.db_connect_col(dbname1, comname1)
        print 'load db2: ', dbname2, comname2
        com2 = dbt.db_connect_col(dbname2, comname2)


        for uid in user1:
            # set uid
            row = [uid, groupname]
            # set attrition states
            u1 = com1.find_one({'id': uid})
            u2 = com2.find_one({'id': uid})
            u1_time = u1['_id'].generation_time.replace(tzinfo=None)

            # if u2 is None or u2['timeline_count'] == 0:
            drop = 1
            if u2:
                u2_time = u2['_id'].generation_time.replace(tzinfo=None)
                if 'status' in u2:
                    second_last_post = datetime.strptime(u2['status']['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                    if u1_time < second_last_post < u2_time:
                        drop = 0
            row.append(drop)
            row.append(u1.get('level'))  # 'level' may be absent outside the ED group
            # set users liwc feature
            uatt = iot.get_fields_one_doc(u1, fields)
            row.extend(uatt)
            # # set users liwc changes
            # uvs = df[df.user_id == str(uid)].loc[:, trimed_fields]
            # # print uvs
            # if len(uvs) == 2:
            #     changes, priors, posts = [], [], []
            #     for name in trimed_fields:
            #         old = uvs.iloc[0][name]
            #         new = uvs.iloc[1][name]
            #         priors.append(old)
            #         posts.append(new)
            #         changes.append(new - old)
            #     row.extend(priors)
            #     row.extend(posts)
            #     row.extend(changes)
            # else:
            #     row.extend([None]*(len(trimed_fields)*3))

            # set profile, active days and eigenvector centrality
            print u1['id']
            row.extend(active_days(u1))
            row.extend([eigen_map.get(u1['id'], 0)])
            row.extend([pagerank_map.get(u1['id'], 0)])
            row.extend([indegree_map.get(u1['id'], 0)])
            row.extend([outdegree_map.get(u1['id'], 0)])
            row.extend([
                # u1['recovery_tweets'],
                u1['timeline_count']])

            exist = True
            try:
                v = network1.vs.find(name=str(uid))
            except ValueError:
                exist = False
            if exist:
                # friends = set(network1.neighbors(str(uid))) # id or name
                friends = set(network1.successors(str(uid)))
                if len(friends) > 0:
                    friend_ids = [int(network1.vs[vi]['name']) for vi in friends] # return id
                    print uid in friend_ids
                    print len(friend_ids)
                    fatts = []
                    alive = 0
                    for fid in friend_ids:
                        fu = com1.find_one({'id': fid, 'liwc_anal.result.WC':{'$exists':True}})
                        fu2 = com2.find_one({'id': fid})

                        if fu:
                            f1_time = fu['_id'].generation_time.replace(tzinfo=None)
                            # if eigen_map.get(fu['id'], 0) > 0.0001:
                            if True:
                                fatt = iot.get_fields_one_doc(fu, fields)
                                factive = active_days(fu)
                                if fu2:
                                    f2_time = fu2['_id'].generation_time.replace(tzinfo=None)
                                    if 'status' in fu2:
                                        fsecond_last_post = datetime.strptime(fu2['status']['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                                        if f1_time < fsecond_last_post < f2_time:
                                            alive += 1
                                            factive = active_days(fu2)

                                fatt.extend(factive)
                                fatt.extend([eigen_map.get(fu['id'], 0)])
                                fatt.extend([pagerank_map.get(fu['id'], 0)])
                                fatt.extend([indegree_map.get(fu['id'], 0)])
                                fatt.extend([outdegree_map.get(fu['id'], 0)])
                                fatt.extend([fu['timeline_count']])
                                fatts.append(fatt)

                    if len(fatts) > 0:
                        fatts = np.array(fatts)
                        fmatts = np.mean(fatts, axis=0)
                        row.extend(fmatts)
                        row.append(len(fatts))
                        paliv = float(alive)/len(fatts)
                        print 'Alive %d %d %.3f' % (alive, len(fatts), paliv)
                        row.append(paliv)
            # pad short rows (user absent from network or no qualifying friends)
            row.extend([None] * (len(attr_names) - len(row)))
            data.append(row)
    df = pd.DataFrame(data, columns=attr_names)
    df.to_csv(filepath, index=False)
Example #16
def data_split(dbname='TwitterProAna', colname='tweets'):
    # # https://stackoverflow.com/questions/8136652/query-mongodb-on-month-day-year-of-a-datetime
    # # Label tweets with dates
    # tweets = dbt.db_connect_col(dbname, colname)
    # # basedate = datetime(1970, 1, 1)
    # # tweets.create_index([('date_week', pymongo.ASCENDING)])
    # # for tweet in tweets.find({}, no_cursor_timeout=True):
    # #     creat = tweet['created_at']
    # #     detal = creat - basedate
    # #     datestr = detal.days // 7 + 1
    # #     tweets.update_one({'id': tweet['id']}, {'$set': {"date_week": datestr}}, upsert=False)
    #
    # # # Indexing tweets with dates
    # date_index = {}
    # for tweet in tweets.find({}, ['id', 'date_week'], no_cursor_timeout=True):
    #     tid, date = tweet['id'], tweet['date_week']
    #     tlist = date_index.get(date, [])
    #     tlist.append(tid)
    #     date_index[date] = tlist
    # pickle.dump(date_index, open('date_tid_list_week.pick', 'w'))
    #
    # # Bunch with tweets in give dates to produce LIWC results
    # # tweets = dbt.db_connect_col(dbname, colname)
    # # date_index = pickle.load(open('date_tid_list_week.pick', 'r'))
    # timeseries = dbt.db_connect_col(dbname, 'weekseries')
    # for key in date_index.keys():
    #     tlist = date_index[key]
    #     textmass = ''
    #     for tid in tlist:
    #         tweet = tweets.find_one({'id': tid})
    #         text = tweet['text'].encode('utf8')
    #         # replace RT, @, # and Http://
    #         match = rtgrex.search(text)
    #         if match is None:
    #             text = mgrex.sub('', text)
    #             text = hgrex.sub('', text)
    #             text = ugrex.sub('', text)
    #             text = text.strip()
    #             if not(text.endswith('.') or text.endswith('?') or text.endswith('!')):
    #                 text += '.'
    #             textmass += " " + text.lower()
    #     words = textmass.split()
    #     # Any text with fewer than 50 words should be looked at with a certain degree of skepticism.
    #     if len(words) > 50:
    #         liwc_result = liwc.summarize_document(' '.join(words))
    #         timeseries.insert({'date': key, 'liwc':liwc_result})

    timeseries = dbt.db_connect_col(dbname, 'weekseries')
    fields = iot.read_fields()
    fields_trim = [f.replace('liwc_anal.result.', '') for f in fields]
    fields = [f.replace('_anal.result', '') for f in fields]

    print len(fields)
    data = []
    basedate = datetime(1970, 1, 1)
    for entry in timeseries.find():
        time = entry['date']
        # date = datetime.strptime(time, '%Y-%m')
        # date = datetime.date(year=int(time[0]), month=int(time[1]))
        # detal = creat - basedate
    # #     datestr = detal.days // 7 + 1
        days = (time - 1) * 7
        date = basedate + timedelta(days=days)  # timedelta, not datetime.timedelta (needs: from datetime import timedelta)
        features = iot.get_fields_one_doc(entry, fields)
        data.append([date] + features)
    df = pd.DataFrame(data=data, columns=['date'] + fields_trim)
    df.to_csv('ian-liwc-tweets-week.csv')
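For reference, the date_week label in the commented pipeline maps a timestamp to (days since 1970-01-01) // 7 + 1, and the loop above inverts it; a quick round-trip check of that arithmetic:

from datetime import datetime, timedelta

basedate = datetime(1970, 1, 1)
created = datetime(2016, 5, 12)                         # any tweet timestamp
week = (created - basedate).days // 7 + 1               # forward: the date_week label
week_start = basedate + timedelta(days=(week - 1) * 7)  # inverse: used in the loop above
assert week_start <= created < week_start + timedelta(days=7)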