# NOTE: this module is Python 2 code. It assumes the usual imports at the top
# of the file: pickle, `from datetime import datetime, timedelta`, numpy as np,
# pandas as pd, and `from scipy import spatial`, plus the project-local helpers
# referenced below (dbt/dbutil, gt, iot, io, timelines, userlook, liwc, and the
# rtgrex/mgrex/hgrex/ugrex regexes), whose import paths are repo-specific and
# not reproduced here.


def friend_network_hashtag_weight(dbname, netname):
    '''
    Community detection on friendship network, weighted by hashtag similarity
    :param dbname:
    :param netname:
    :return:
    '''
    user_hash_profile = pickle.load(open('data/user-hash-profile.pick', 'rb'))
    net = gt.load_network(dbname, netname)
    fields = iot.read_fields()
    com = dbt.db_connect_col(dbname, 'scom')
    for edge in net.es:
        source_vertex_id = edge.source
        target_vertex_id = edge.target
        source_uid = int(net.vs[source_vertex_id]['name'])
        target_uid = int(net.vs[target_vertex_id]['name'])
        source_user = com.find_one({'id': source_uid})
        target_user = com.find_one({'id': target_uid})
        source_user_liwc = iot.get_fields_one_doc(source_user, fields)
        target_user_liwc = iot.get_fields_one_doc(target_user, fields)
        source_user_liwc.extend(user_hash_profile[source_uid])
        target_user_liwc.extend(user_hash_profile[target_uid])
        print len(target_user_liwc)
        dis = spatial.distance.euclidean(source_user_liwc, target_user_liwc)
        edge['weight'] = 1.0 / (1.0 + dis)
    net.write_graphml('ed_weighted_follow.graphml')
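# A minimal sketch of the edge-weighting transform used above: the Euclidean
# distance between two users' combined LIWC+hashtag feature vectors is mapped
# to a similarity weight w = 1 / (1 + d), so identical profiles get weight 1
# and the weight decays toward 0 as profiles diverge. The vectors here are
# made-up toy values, not real profiles.
def _example_edge_weight():
    from scipy import spatial
    u = [2.1, 0.5, 3.0]  # hypothetical source-user feature vector
    v = [1.9, 0.7, 2.5]  # hypothetical target-user feature vector
    dis = spatial.distance.euclidean(u, v)
    return 1.0 / (1.0 + dis)  # ~0.64 for these toy vectors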
def read_user_time(filename):
    fields = iot.read_fields()
    trimed_fields = [field.split('.')[-1] for field in fields]
    groups = [
        ('ED', 'fed', 'com', {'liwc_anal.result.WC': {'$exists': True}, 'level': 1}),
        ('RD', 'random', 'scom', {'liwc_anal.result.WC': {'$exists': True}}),
        ('YG', 'younger', 'scom', {'liwc_anal.result.WC': {'$exists': True}})
    ]
    data = []
    for tag, dbname, comname, filter_values in groups:
        com = dbt.db_connect_col(dbname, comname)
        for user in com.find(filter_values, no_cursor_timeout=True):
            if 'status' in user:
                created_at = datetime.strptime(user['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                scraped_at = user['scrape_timeline_at']
                last_post = datetime.strptime(user['status']['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                life_time = diff_day(last_post, created_at)
                # Guard against a zero tweet count; max(), not min(), avoids
                # dividing by zero.
                average_time = float(life_time) / max(1, user['statuses_count'])
                longest_tweet_intervalb = user['longest_tweet_interval']
                observation_interval = diff_day(scraped_at, last_post)
                # A user is labeled dead if the silence since the last post
                # exceeds the longest gap ever observed in their own timeline
                # by more than 30 days.
                if (observation_interval - longest_tweet_intervalb) > 30:
                    death = 1
                else:
                    death = 0
                values = iot.get_fields_one_doc(user, fields)
                data.append([user['id_str'], created_at, last_post, scraped_at,
                             average_time, longest_tweet_intervalb,
                             observation_interval, tag, death] + values)
    df = pd.DataFrame(data, columns=['uid', 'created_at', 'last_post', 'scraped_at',
                                     'average_time', 'longest_time_interval',
                                     'observation_interval', 'group', 'event'] + trimed_fields)
    df.to_csv(filename)
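# diff_day() is a project-local helper used throughout this section but not
# defined in it. A minimal sketch of what it is assumed to compute (whole days
# between two naive datetimes); the real helper lives elsewhere in this repo:
def _diff_day_sketch(later, earlier):
    return (later - earlier).days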
def bio_all_in_line(dbname='TwitterProAna', colname='bio'):
    names = ['cbmi', 'gbmi', 'a', 'gender', 'h', 'cw', 'gw', 'lw', 'hw', 'ugw']
    fields = ['id', 'date'] + [name + '.value' for name in names]
    data = []
    bio = dbt.db_connect_col(dbname, colname)
    for entry in bio.find({}):
        dat = iot.get_fields_one_doc(entry, fields)
        if set(dat[3:]) != set([0.0]):
            data.append(dat)
    df = pd.DataFrame(data=data, columns=['id', 'date'] + names)
    df.to_csv('ian-all.csv')
def bmi_regression(dbname, colname, filename):
    # Regress BMI against user features; merges LIWC features from the main
    # collection with the poi fields from the fed2/fed3 snapshots of the same users.
    fields = iot.read_fields()
    poi_fields = fields[-9:-1]
    print poi_fields

    trimed_fields = [(field.split('.')[-1]) for field in fields]
    trimed_fields[-10:] = ['sentiment', 'age', 'gender', 'height', 'cw', 'gw',
                           'cbmi', 'gbmi', 'edword', 'level']

    com = dbutil.db_connect_col(dbname, colname)
    data = []
    # for user in com.find({'$or': [{'text_anal.cbmi.value': {'$exists': True}},
    #                               {'text_anal.gbmi.value': {'$exists': True}}],
    #                       'liwc_anal.result.WC': {'$exists': True}}, no_cursor_timeout=True):
    com2 = dbutil.db_connect_col('fed2', colname)
    com3 = dbutil.db_connect_col('fed3', colname)
    for user in com.find({'liwc_anal.result.WC': {'$exists': True}},
                         no_cursor_timeout=True):
        values = iot.get_fields_one_doc(user, fields)
        user2 = com2.find_one({'id': user['id']})
        if user2:
            values.extend(iot.get_fields_one_doc(user2, poi_fields))
        else:
            values.extend([0] * len(poi_fields))
        user3 = com3.find_one({'id': user['id']})
        if user3:
            values.extend(iot.get_fields_one_doc(user3, poi_fields))
        else:
            values.extend([0] * len(poi_fields))
        data.append(values)
    df = pd.DataFrame(data, columns=trimed_fields +
                      [(field.split('.')[-2] + '_p2') for field in poi_fields] +
                      [(field.split('.')[-2] + '_p3') for field in poi_fields])
    df.to_csv(filename)
def color_classify(userlabels, field_names, file_name, dbname):
    fw = open(file_name + '.data', 'w')
    db = dbt.db_connect_no_auth(dbname)
    poi = db['com']
    # format: 6,7,11,12 1:-0.022711 2:-0.050504 3:-0.035691
    for uid in userlabels.keys():
        labels = userlabels[uid]
        user = poi.find_one({'id': uid}, ['liwc_anal.result'])
        values = io.get_fields_one_doc(user, field_names)
        outstr = ','.join(str(x) for x in labels)
        outstr += ' '
        for i in xrange(len(values)):
            outstr += str(i + 1) + ':' + str(values[i]) + ' '
        fw.write(outstr + '\n')
    fw.close()
def feature_output(field_names, file_name, dbname, colname, label=None,
                   outids=False, userset=[], extend_features={}):
    # Dump one svmlight-style line per user: "<label-or-id> 1:v1 2:v2 ...".
    fw = open(file_name + '.data', 'w')
    poi = dbt.db_connect_col(dbname, colname)
    index = 0
    maxsize = 10000000000000000
    uids = list()
    # exclude_set = set([4319191638L, 2627223434L, 2976822286L, 4788248335L, 3289264086L, 520847919, 439647015, 947539758, 617442479, 2481703728L, 2913311029L, 3760687289L, 2303011905L, 1712561862, 2882255303L, 261549132, 982895821, 2849269327L, 312684498, 160044558, 774072534, 330611545, 430569947, 1275228253, 3399616094L, 2924322143L, 457692129, 3006221026L, 2837359399L, 18942418, 2848241137L, 273768180, 235857269, 3315086840L])
    for x in poi.find({
            # 'text_anal.edword_count.value': {'$gt': 0},
            # 'id': {'$in': userset},
            'liwc_anal.result.WC': {'$exists': True},
            # 'text_anal.gbmi': {'$exists': True},
            # 'timeline_count': {'$gt': 100},
            # 'level': {'$gt': 1}
    }):
        # if index < maxsize and int(x['id']) not in exclude_set:
        if index < maxsize:
            uid = int(x['id'])
            uids.append(uid)
            values = io.get_fields_one_doc(x, field_names)
            # if uid in extend_features:
            #     values.extend(extend_features[uid])
            if label:
                outstr = label + ' '
            else:
                outstr = x['id_str'] + ' '
            for i in xrange(len(values)):
                outstr += str(i + 1) + ':' + str(values[i]) + ' '
            index += 1
            fw.write(outstr + '\n')
    fw.close()
    print len(uids)
    if outids:
        pickle.dump(uids, open(file_name + '_ids.data', 'w'))
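# The .data files written by color_classify() and feature_output() follow the
# svmlight/libsvm convention ("<label> 1:v1 2:v2 ..."). Assuming scikit-learn
# is available, they can be loaded back as a sparse matrix; multilabel=True
# handles the comma-separated label lists that color_classify() emits:
def _load_feature_file(path, multilabel=False):
    from sklearn.datasets import load_svmlight_file
    X, y = load_svmlight_file(path, multilabel=multilabel)
    return X, y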
def out_data():
    control = dbt.db_connect_col('fed', 'control_com')
    treat = dbt.db_connect_col('fed', 'treat_com')
    control_user = iot.get_values_one_field('fed', 'control_com', 'id',
                                            {'prior_liwc.result.WC': {'$exists': True},
                                             'post_liwc.result.WC': {'$exists': True}})
    treat_user = iot.get_values_one_field('fed', 'treat_com', 'id',
                                          {'prior_liwc.result.WC': {'$exists': True},
                                           'post_liwc.result.WC': {'$exists': True}})
    data = []
    fields = iot.read_fields()
    prefix = ['prior_liwc', 'post_liwc']
    # Long-format panel: one row per user per period, flagged by
    # treated (0 = control, 1 = treatment) and time (0 = prior, 1 = post).
    for i in xrange(2):
        uids = [control_user, treat_user][i]
        for uid in uids:
            user = [control, treat][i].find_one({'id': uid})
            for j in xrange(2):
                fields_new = ['id_str'] + [field.replace('liwc_anal', prefix[j])
                                           for field in fields]
                values = iot.get_fields_one_doc(user, fields_new)
                data.append(values + [i, j])
    df = pd.DataFrame(data, columns=['id'] + [field.split('.')[-1] for field in fields]
                      + ['treated', 'time'])
    df.to_csv('treatment.csv')
def emotion_recovery_IV_following(dbname1, dbname2, comname1, comname2):
    '''
    Only use following stats
    :param dbname1:
    :param dbname2:
    :param comname1:
    :param comname2:
    :return:
    '''
    print 'load liwc 2 batches'
    df = pd.read_pickle('ed-liwc2stage.csv' + '.pick')
    filter_que = {'level': 1, 'liwc_anal.result.WC': {'$exists': True}}
    user1 = iot.get_values_one_field(dbname1, comname1, 'id', filter_que)
    com1 = dbt.db_connect_col(dbname1, comname1)
    com2 = dbt.db_connect_col(dbname2, comname2)
    fields = ['liwc_anal.result.posemo',
              'liwc_anal.result.negemo',
              'liwc_anal.result.ingest',
              'liwc_anal.result.bio',
              'liwc_anal.result.body',
              'liwc_anal.result.health',
              'liwc_anal.result.death'
              # 'liwc_anal.result.anx',
              # 'liwc_anal.result.anger',
              # 'liwc_anal.result.sad'
              ]
    trimed_fields = [field.split('.')[-1] for field in fields]
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
                  'friends_day', 'statuses_day', 'followers_day', 'days', 'eigenvector']
    attr_names = ['uid', 'attr', 'u_timeline_count_2p']
    attr_names.extend(['u_' + field for field in trimed_fields])
    attr_names.extend(['u_prior_' + field for field in trimed_fields])
    attr_names.extend(['u_post_' + field for field in trimed_fields])
    attr_names.extend(['u_change_' + field for field in trimed_fields])
    attr_names.extend(['u_' + field for field in prof_names])
    attr_names.extend(['u_recovery_tweets', 'u_timeline_count'])
    attr_names.extend(['f_' + field.split('.')[-1] for field in fields])
    attr_names.extend(['f_' + field for field in prof_names])
    attr_names.extend(['f_num', 'f_palive'])
    print attr_names

    # The follow network and its eigenvector-centrality lookup are needed
    # below; they are built here the same way the sibling IV functions do it.
    network1 = gt.load_network(dbname1, 'net')
    eigen = network1.eigenvector_centrality()
    nodes = [int(v['name']) for v in network1.vs]
    eigen_map = dict(zip(nodes, eigen))

    data = []
    for uid in user1:
        # set uid
        row = [uid]
        # set attrition states
        u1 = com1.find_one({'id': uid})
        u2 = com2.find_one({'id': uid})
        if u2 is None or u2['timeline_count'] == 0:
            row.extend([None] * 2)
        else:
            row.extend([u2['recovery_tweets'], u2['timeline_count']])
        # set users liwc feature
        uatt = iot.get_fields_one_doc(u1, fields)
        row.extend(uatt)
        # set users liwc changes
        uvs = df[df.user_id == str(uid)].loc[:, trimed_fields]
        if len(uvs) == 2:
            changes, priors, posts = [], [], []
            for name in trimed_fields:
                old = uvs.iloc[0][name]
                new = uvs.iloc[1][name]
                priors.append(old)
                posts.append(new)
                changes.append(new - old)
            row.extend(priors)
            row.extend(posts)
            row.extend(changes)
        else:
            row.extend([None] * (len(trimed_fields) * 3))
        # set profile, active days and eigenvector centrality
        row.extend(active_days(u1))
        row.extend([eigen_map.get(u1['id'])])
        row.extend([u1['recovery_tweets'], u1['timeline_count']])

        exist = True
        try:
            v = network1.vs.find(name=str(uid))
        except ValueError:
            exist = False
        if exist:
            # friends = set(network1.neighbors(str(uid)))  # id or name
            friends = set(network1.successors(str(uid)))
            if len(friends) > 0:
                friend_ids = [int(network1.vs[vi]['name']) for vi in friends]  # return id
                print uid in friend_ids
                print len(friend_ids)
                fatts = []
                alive = 0
                for fid in friend_ids:
                    fu = com1.find_one({'id': fid,
                                        'liwc_anal.result.WC': {'$exists': True},
                                        'status': {'$exists': True}})
                    fu2 = com2.find_one({'id': fid})
                    if fu is not None:
                        fatt = iot.get_fields_one_doc(fu, fields)
                        fatt.extend(active_days(fu))
                        fatt.extend([eigen_map.get(fu['id'])])
                        fatts.append(fatt)
                        if fu2 is None or fu2['timeline_count'] == 0:
                            alive += 0
                        else:
                            alive += 1
                if len(fatts) > 0:
                    fatts = np.array(fatts)
                    fmatts = np.mean(fatts, axis=0)
                    row.extend(fmatts)
                    row.append(len(fatts))
                    paliv = float(alive) / len(fatts)
                    print 'Alive %d %d %.3f' % (alive, len(fatts), paliv)
                    row.append(paliv)
        data.append(row)
    df = pd.DataFrame(data, columns=attr_names)
    df.to_csv('data-recover-following.csv', index=False)
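# active_days() is another project-local helper that is not defined in this
# section. Judging from the prof_names columns it fills, it is assumed to
# return seven profile stats per user: the three raw counts, their per-day
# rates, and the account age in days. A speculative sketch under exactly that
# assumption (the keys are Twitter's standard profile fields):
def _active_days_sketch(user):
    from datetime import datetime
    created = datetime.strptime(user['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
    end = user['_id'].generation_time.replace(tzinfo=None)  # scrape time
    days = max(1, (end - created).days)
    fr = user['friends_count']
    st = user['statuses_count']
    fo = user['followers_count']
    return [fr, st, fo, float(fr) / days, float(st) / days, float(fo) / days, days]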
def attribute_corre(filename):
    fields = ['liwc_anal.result.posemo',
              'liwc_anal.result.negemo',
              'liwc_anal.result.ingest',
              'liwc_anal.result.bio',
              'liwc_anal.result.body',
              'liwc_anal.result.health',
              'liwc_anal.result.death',
              'liwc_anal.result.anx',
              'liwc_anal.result.anger',
              'liwc_anal.result.sad',
              'liwc_anal.result.i',
              'liwc_anal.result.we',
              'liwc_anal.result.negate',
              'liwc_anal.result.swear',
              'liwc_anal.result.social',
              'liwc_anal.result.family',
              'liwc_anal.result.friend',
              'liwc_anal.result.affect',
              'senti.result.whole.posm',
              # 'senti.result.whole.posstd',
              'senti.result.whole.negm',
              # 'senti.result.whole.negstd',
              'senti.result.whole.scalem',
              # 'senti.result.whole.scalestd',
              'senti.result.whole.N',
              'senti.result.prior.scalem',
              'senti.result.post.scalem']
    trimed_fields = ['-'.join(field.split('.')[-2:]) for field in fields]
    groups = [('ED', 'fed', 'com', 'fed', 'com_survival',
               {'liwc_anal.result.WC': {'$exists': True},
                'level': 1,
                'senti.result.whole.N': {'$gt': 10}}),
              ('RD', 'random', 'scom', 'random', 'com_survival',
               {'liwc_anal.result.WC': {'$exists': True},
                'senti.result.whole.N': {'$gt': 10}}),
              ('YG', 'younger', 'scom', 'younger', 'com_survival',
               {'liwc_anal.result.WC': {'$exists': True},
                'senti.result.whole.N': {'$gt': 10}})]
    data = []
    for tag, dbname, comname, dbname2, comname2, filter_values in groups:
        com = dbt.db_connect_col(dbname, comname)
        network1 = gt.Graph.Read_GraphML(tag.lower() + '-net-all-active.graphml')
        for user in com.find(filter_values, no_cursor_timeout=True):
            uid = user['id']
            level = user.get('level')
            values = iot.get_fields_one_doc(user, fields)
            exist = True
            try:
                v = network1.vs.find(name=str(uid))
            except ValueError:
                exist = False
            if exist:
                # friends = set(network1.neighbors(str(uid)))  # id or name
                friends = set(network1.successors(str(uid)))
                fatts = []
                alive = 0
                if len(friends) > 0:
                    friend_ids = [int(network1.vs[vi]['name']) for vi in friends]  # return id
                    print uid in friend_ids
                    print len(friend_ids)
                    for fid in friend_ids:
                        fu = com.find_one({'id': fid,
                                           'liwc_anal.result.WC': {'$exists': True},
                                           'senti.result.whole.N': {'$gt': 10}})
                        if fu:
                            fatt = iot.get_fields_one_doc(fu, fields)
                            fatts.append(fatt)
                            alive += 1
                if len(fatts) > 0:
                    fatts = np.array(fatts)
                    fmatts = np.mean(fatts, axis=0)
                    values.extend(fmatts)
                    data.append([user['id_str'], level, tag, alive, len(friends)] + values)
    df = pd.DataFrame(data, columns=['uid', 'level', 'group', 'alive_friends', 'all_friends']
                      + ['u_' + field for field in trimed_fields]
                      + ['f_' + tf for tf in trimed_fields])
    df.to_csv(filename)
def user_profiles(dbname, comname, userfile='data/actor.uid'):
    # Get profile information for regression
    uids = pickle.load(open(userfile))
    print len(uids)
    com = dbt.db_connect_col(dbname, comname)
    newcom = dbt.db_connect_col(dbname, 'pro_mention_miss_com')
    # newcom.create_index("id", unique=True)

    # # Collect miss data
    # missuids, taguids = [], []
    # for uid in uids:
    #     user = com.find_one({'id': int(uid)})
    #     if user is None:
    #         missuids.append(int(uid))
    #     else:
    #         taguids.append(int(uid))
    # list_size = len(missuids)
    # print '%d users to process' % list_size
    # length = int(math.ceil(list_size/100.0))
    # for index in xrange(length):
    #     index_begin = index*100
    #     index_end = min(list_size, index_begin+100)
    #     userlook.lookup_user_list(missuids[index_begin:index_end], newcom, 1, 'N')

    # # Collect tweets for missing users
    # converstream = dbt.db_connect_col(dbname, 'pro_mention_timeline')
    # most_recenty = converstream.find().sort([('id', -1)]).limit(1)
    # oldest = converstream.find().sort([('id', 1)]).limit(1)
    # max_id = most_recenty[0]['id']
    # since_id = oldest[0]['id']
    # print most_recenty[0]
    # print oldest[0]
    # com = dbt.db_connect_col(dbname, 'pro_mention_miss_com')
    # timeline = dbt.db_connect_col(dbname, 'pro_mention_miss_timeline')
    # com.create_index([('timeline_scraped_times', pymongo.ASCENDING)])
    # timeline.create_index([('user.id', pymongo.ASCENDING),
    #                        ('id', pymongo.DESCENDING)])
    # timeline.create_index([('id', pymongo.ASCENDING)], unique=True)
    # print datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + "\t" + 'Connect Twitter.com'
    # timelines.retrieve_timeline(com, timeline, max_id)
    # print datetime.now().strftime("%Y-%m-%d-%H-%M-%S"), 'finish timeline for sample users'

    data = []
    fields = iot.read_fields()
    miss_count = 0
    print fields
    for uid in uids:
        user = com.find_one({'id': int(uid)})
        if user is not None:
            row = iot.get_fields_one_doc(user, fields)
            data.append(row)
        else:
            user = newcom.find_one({'id': int(uid)})
            if user is not None:
                row = iot.get_fields_one_doc(user, fields)
                data.append(row)
            else:
                miss_count += 1
    print miss_count, miss_count * 1.0 / len(uids)
    df = pd.DataFrame(data=data, columns=['uid', 'posemo', 'negemo', 'senti'])
    df.to_csv('data/emotions.csv')
def emotion_dropout_IV_split(dbname1, dbname2, comname1, comname2):
    '''
    Split followees and followers as different variables
    :param dbname1:
    :param dbname2:
    :param comname1:
    :param comname2:
    :return:
    '''
    filter_que = {'level': 1, 'liwc_anal.result.WC': {'$exists': True}}
    user1 = iot.get_values_one_field(dbname1, comname1, 'id', filter_que)
    com1 = dbt.db_connect_col(dbname1, comname1)
    com2 = dbt.db_connect_col(dbname2, comname2)
    fields = ['liwc_anal.result.posemo',
              'liwc_anal.result.negemo',
              'liwc_anal.result.anx',
              'liwc_anal.result.anger',
              'liwc_anal.result.sad']
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
                  'friends_day', 'statuses_day', 'followers_day', 'days']
    attr_names = ['uid', 'attr']
    attr_names.extend(['u_' + field.split('.')[-1] for field in fields])
    attr_names.extend(['u_' + field for field in prof_names])
    attr_names.extend(['fr_' + field.split('.')[-1] for field in fields])
    attr_names.extend(['fr_' + field for field in prof_names])
    attr_names.extend(['fr_num', 'fr_palive'])
    attr_names.extend(['fo_' + field.split('.')[-1] for field in fields])
    attr_names.extend(['fo_' + field for field in prof_names])
    attr_names.extend(['fo_num', 'fo_palive'])
    attr_names.extend(['co_' + field.split('.')[-1] for field in fields])
    attr_names.extend(['co_' + field for field in prof_names])
    attr_names.extend(['co_num', 'co_palive'])
    print attr_names
    attr_length = len(fields) + len(prof_names) + 2

    network1 = gt.load_network(dbname1, 'net')
    data = []
    for uid in user1:
        row = [uid]
        u1 = com1.find_one({'id': uid})
        u2 = com2.find_one({'id': uid})
        # attr = 1 marks attrition: missing from the second scrape or no tweets.
        if u2 is None or u2['timeline_count'] == 0:
            row.append(1)
        else:
            row.append(0)
        uatt = iot.get_fields_one_doc(u1, fields)
        row.extend(uatt)
        row.extend(active_days(u1))

        exist = True
        try:
            v = network1.vs.find(name=str(uid))
        except ValueError:
            exist = False
        if exist:
            print '--------------------user %d---------------' % uid
            followees = set([int(network1.vs[vi]['name'])
                             for vi in network1.successors(str(uid))])
            followers = set([int(network1.vs[vi]['name'])
                             for vi in network1.predecessors(str(uid))])
            common = followees.intersection(followers)
            followees = followees - common
            followers = followers - common
            for friend_ids in [followees, followers, common]:
                if len(friend_ids) > 0:
                    print uid in friend_ids
                    print len(friend_ids)
                    fatts = []
                    alive = 0
                    for fid in friend_ids:
                        fu = com1.find_one({'id': fid,
                                            'liwc_anal.result.WC': {'$exists': True},
                                            'status': {'$exists': True}})
                        fu2 = com2.find_one({'id': fid})
                        if fu is not None:
                            fatt = iot.get_fields_one_doc(fu, fields)  # Friends' LIWC
                            fatt.extend(active_days(fu))
                            fatts.append(fatt)
                            if fu2 is None or fu2['timeline_count'] == 0:
                                alive += 0
                            else:
                                alive += 1
                    if len(fatts) > 0:
                        fatts = np.array(fatts)
                        fmatts = np.mean(fatts, axis=0)
                        row.extend(fmatts)
                        row.append(len(fatts))
                        paliv = float(alive) / len(fatts)
                        print 'Alive %d %d %.3f' % (alive, len(fatts), paliv)
                        row.append(paliv)
                else:
                    row.extend([None] * attr_length)
        data.append(row)
    df = pd.DataFrame(data, columns=attr_names)
    df.to_csv('data-attr-split.csv', index=False)
def emotion_dropout_IV_combine(dbname1, dbname2, comname1, comname2):
    '''
    Combine followees and followers together as variables
    :param dbname1:
    :param dbname2:
    :param comname1:
    :param comname2:
    :return:
    '''
    filter_que = {'level': 1, 'liwc_anal.result.WC': {'$exists': True}}
    user1 = iot.get_values_one_field(dbname1, comname1, 'id', filter_que)
    com1 = dbt.db_connect_col(dbname1, comname1)
    com2 = dbt.db_connect_col(dbname2, comname2)
    fields = ['liwc_anal.result.posemo',
              'liwc_anal.result.negemo',
              'liwc_anal.result.anx',
              'liwc_anal.result.anger',
              'liwc_anal.result.sad']
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
                  'friends_day', 'statuses_day', 'followers_day', 'days']
    attr_names = ['uid', 'attr']
    attr_names.extend(['u_' + field.split('.')[-1] for field in fields])
    attr_names.extend(['u_' + field for field in prof_names])
    attr_names.extend(['f_' + field.split('.')[-1] for field in fields])
    attr_names.extend(['f_' + field for field in prof_names])
    attr_names.extend(['f_num', 'f_palive'])
    print attr_names

    network1 = gt.load_network(dbname1, 'net')
    data = []
    for uid in user1:
        row = [uid]
        u1 = com1.find_one({'id': uid})
        u2 = com2.find_one({'id': uid})
        if u2 is None or u2['timeline_count'] == 0:
            row.append(1)
        else:
            row.append(0)
        uatt = iot.get_fields_one_doc(u1, fields)
        row.extend(uatt)
        row.extend(active_days(u1))

        exist = True
        try:
            v = network1.vs.find(name=str(uid))
        except ValueError:
            exist = False
        if exist:
            friends = set(network1.neighbors(str(uid)))  # id or name
            if len(friends) > 0:
                friend_ids = [int(network1.vs[vi]['name']) for vi in friends]  # return id
                print uid in friend_ids
                print len(friend_ids)
                fatts = []
                alive = 0
                for fid in friend_ids:
                    fu = com1.find_one({'id': fid,
                                        'liwc_anal.result.WC': {'$exists': True},
                                        'status': {'$exists': True}})
                    fu2 = com2.find_one({'id': fid})
                    if fu is not None:
                        fatt = iot.get_fields_one_doc(fu, fields)
                        fatt.extend(active_days(fu))
                        fatts.append(fatt)
                        if fu2 is None or fu2['timeline_count'] == 0:
                            alive += 0
                        else:
                            alive += 1
                if len(fatts) > 0:
                    fatts = np.array(fatts)
                    fmatts = np.mean(fatts, axis=0)
                    row.extend(fmatts)
                    row.append(len(fatts))
                    paliv = float(alive) / len(fatts)
                    print 'Alive %d %d %.3f' % (alive, len(fatts), paliv)
                    row.append(paliv)
        data.append(row)
    df = pd.DataFrame(data, columns=attr_names)
    df.to_csv('data-attr-combine.csv', index=False)
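# Example invocation of the two IV designs above; the database/collection
# names are the ones this module's other functions pair together ('fed' with
# its second-stage snapshot 'fed_sur'), so adjust them to your deployment:
# emotion_dropout_IV_combine('fed', 'fed_sur', 'com', 'com')
# emotion_dropout_IV_split('fed', 'fed_sur', 'com', 'com')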
def emotion_dropout_IV_split(dbname1, dbname2, comname1, comname2):
    '''
    Split followees and followers as different variables
    :param dbname1:
    :param dbname2:
    :param comname1:
    :param comname2:
    :return:
    '''
    filter_que = {'level': 1, 'liwc_anal.result.WC': {'$exists': True}}
    user1 = iot.get_values_one_field(dbname1, comname1, 'id', filter_que)
    com1 = dbt.db_connect_col(dbname1, comname1)
    com2 = dbt.db_connect_col(dbname2, comname2)
    fields = ['liwc_anal.result.posemo',
              'liwc_anal.result.negemo',
              'liwc_anal.result.anx',
              'liwc_anal.result.anger',
              'liwc_anal.result.sad']
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
                  'friends_day', 'statuses_day', 'followers_day', 'days', 'eigenvector']
    attr_names = ['uid', 'attr']
    attr_names.extend(['u_' + field.split('.')[-1] for field in fields])
    attr_names.extend(['u_' + field for field in prof_names])
    attr_names.extend(['fr_' + field.split('.')[-1] for field in fields])
    attr_names.extend(['fr_' + field for field in prof_names])
    attr_names.extend(['fr_num', 'fr_palive'])
    attr_names.extend(['fo_' + field.split('.')[-1] for field in fields])
    attr_names.extend(['fo_' + field for field in prof_names])
    attr_names.extend(['fo_num', 'fo_palive'])
    attr_names.extend(['co_' + field.split('.')[-1] for field in fields])
    attr_names.extend(['co_' + field for field in prof_names])
    attr_names.extend(['co_num', 'co_palive'])
    print attr_names
    attr_length = len(fields) + len(prof_names) + 2

    network1 = gt.load_network(dbname1, 'net')

    '''Centralities Calculation'''
    eigen = network1.eigenvector_centrality()
    # closeness = network1.closeness()
    # betweenness = network1.betweenness()
    nodes = [int(v['name']) for v in network1.vs]
    eigen_map = dict(zip(nodes, eigen))
    # closeness_map = dict(zip(nodes, closeness))
    # betweenness_map = dict(zip(nodes, betweenness))

    data = []
    for uid in user1:
        row = [uid]
        u1 = com1.find_one({'id': uid})
        u2 = com2.find_one({'id': uid})
        if u2 is None or u2['timeline_count'] == 0:
            row.append(1)
        else:
            row.append(0)
        uatt = iot.get_fields_one_doc(u1, fields)
        row.extend(uatt)
        row.extend(active_days(u1))
        row.extend([eigen_map.get(u1['id'])])

        exist = True
        try:
            v = network1.vs.find(name=str(uid))
        except ValueError:
            exist = False
        if exist:
            print '--------------------user %d---------------' % uid
            followees = set([int(network1.vs[vi]['name'])
                             for vi in network1.successors(str(uid))])
            followers = set([int(network1.vs[vi]['name'])
                             for vi in network1.predecessors(str(uid))])
            common = followees.intersection(followers)
            followees = followees - common
            followers = followers - common
            for friend_ids in [followees, followers, common]:
                if len(friend_ids) > 0:
                    print uid in friend_ids
                    print len(friend_ids)
                    fatts = []
                    alive = 0
                    for fid in friend_ids:
                        fu = com1.find_one({'id': fid,
                                            'liwc_anal.result.WC': {'$exists': True},
                                            'status': {'$exists': True}})
                        fu2 = com2.find_one({'id': fid})
                        if fu is not None:
                            fatt = iot.get_fields_one_doc(fu, fields)  # Friends' LIWC
                            fatt.extend(active_days(fu))
                            fatt.extend([eigen_map.get(fu['id'])])
                            fatts.append(fatt)
                            if fu2 is None or fu2['timeline_count'] == 0:
                                alive += 0
                            else:
                                alive += 1
                    if len(fatts) > 0:
                        fatts = np.array(fatts)
                        fmatts = np.mean(fatts, axis=0)
                        row.extend(fmatts)
                        row.append(len(fatts))
                        paliv = float(alive) / len(fatts)
                        print 'Alive %d %d %.3f' % (alive, len(fatts), paliv)
                        row.append(paliv)
                else:
                    row.extend([None] * attr_length)
        data.append(row)
    df = pd.DataFrame(data, columns=attr_names)
    df.to_csv('data-attr-split.csv', index=False)
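# The followee/follower/reciprocal split above rests on igraph's directed
# neighborhoods: successors() are the accounts a user follows, predecessors()
# the accounts following the user, and their intersection the reciprocal set.
# A toy check of that set algebra (assumes python-igraph is installed):
def _example_neighbor_split():
    import igraph
    g = igraph.Graph(directed=True)
    g.add_vertices(['u', 'a', 'b'])
    g.add_edges([('u', 'a'), ('a', 'u'), ('u', 'b')])  # u<->a reciprocal, u->b one-way
    uid = g.vs.find(name='u').index
    followees = set(g.successors(uid))
    followers = set(g.predecessors(uid))
    common = followees & followers
    return followees - common, followers - common, common  # ({b}, set(), {a})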
def read_user_time_iv(filename):
    # fields = iot.read_fields()
    fields = ['liwc_anal.result.posemo',
              'liwc_anal.result.negemo',
              'liwc_anal.result.ingest',
              'liwc_anal.result.bio',
              'liwc_anal.result.body',
              'liwc_anal.result.health',
              'liwc_anal.result.death',
              'liwc_anal.result.anx',
              'liwc_anal.result.anger',
              'liwc_anal.result.sad']
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
                  'friends_day', 'statuses_day', 'followers_day', 'days']
    trimed_fields = [field.split('.')[-1] for field in fields]
    groups = [('ED', 'fed', 'com', 'fed_sur', 'com', '2017-06-21 14:57:39+00:00',
               {'liwc_anal.result.WC': {'$exists': True}, 'level': 1}),
              ('RD', 'random', 'scom', 'random_sur', 'com', '2017-06-21 14:57:39+00:00',
               {'liwc_anal.result.WC': {'$exists': True}}),
              ('YG', 'younger', 'scom', 'younger_sur', 'com', '2017-06-21 14:57:39+00:00',
               {'liwc_anal.result.WC': {'$exists': True}})]
    data = []
    for tag, dbname, comname, dbname2, comname2, second_time, filter_values in groups:
        com = dbt.db_connect_col(dbname, comname)
        com2 = dbt.db_connect_col(dbname2, comname2)
        network1 = gt.Graph.Read_GraphML(tag.lower() + '-net.graphml')
        gt.summary(network1)
        network1_gc = gt.giant_component(network1)
        gt.summary(network1_gc)

        '''Centralities Calculation'''
        eigen = network1_gc.eigenvector_centrality()
        pageranks = network1_gc.pagerank()
        indegree = network1_gc.authority_score()
        outdegree = network1_gc.hub_score()
        nodes = [int(v['name']) for v in network1_gc.vs]
        eigen_map = dict(zip(nodes, eigen))
        pagerank_map = dict(zip(nodes, pageranks))
        indegree_map = dict(zip(nodes, indegree))
        outdegree_map = dict(zip(nodes, outdegree))

        print 'load liwc 2 batches: ' + tag.lower() + '-liwc2stage.csv'
        liwc_df = pd.read_pickle(tag.lower() + '-liwc2stage.csv' + '.pick')
        for user in com.find(filter_values, no_cursor_timeout=True):
            first_scraped_at = user['_id'].generation_time.replace(tzinfo=None)
            if 'status' in user:
                uid = user['id']
                u2 = com2.find_one({'id': uid})
                first_last_post = datetime.strptime(user['status']['created_at'],
                                                    '%a %b %d %H:%M:%S +0000 %Y')
                last_post = first_last_post
                drop = 1
                # Default the second-stage values so rows for users missing
                # from the second scrape still line up column-wise.
                second_scraped_at, second_last_post = None, None
                if u2:
                    second_scraped_at = u2['_id'].generation_time.replace(tzinfo=None)
                    if 'status' in u2:
                        second_last_post = datetime.strptime(u2['status']['created_at'],
                                                             '%a %b %d %H:%M:%S +0000 %Y')
                        if first_scraped_at < second_last_post < second_scraped_at:
                            drop = 0
                            last_post = second_last_post
                created_at = datetime.strptime(user['created_at'],
                                               '%a %b %d %H:%M:%S +0000 %Y')
                life_time = diff_day(last_post, created_at)
                # Guard against a zero tweet count (max, not min).
                average_time = float(life_time) / max(1, user['statuses_count'])
                longest_tweet_intervalb = user['longest_tweet_interval']
                u_timeline_count = user['timeline_count']
                values = iot.get_fields_one_doc(user, fields)
                level = user.get('level')

                # set users liwc changes
                uvs = liwc_df[liwc_df.user_id == str(uid)].loc[:, trimed_fields]
                if len(uvs) == 2:
                    changes, priors, posts = [], [], []
                    for name in trimed_fields:
                        old = uvs.iloc[0][name]
                        new = uvs.iloc[1][name]
                        priors.append(old)
                        posts.append(new)
                        changes.append(new - old)
                    liwc_changes = priors + posts + changes
                else:
                    liwc_changes = [None] * (len(trimed_fields) * 3)
                u_centrality = eigen_map.get(user['id'], 0)
                u_pagerank = pagerank_map.get(user['id'], 0)
                u_indegree = indegree_map.get(user['id'], 0)
                u_outdegree = outdegree_map.get(user['id'], 0)
                values.extend(liwc_changes)
                values.extend(active_days(user))

                '''Get friends' profiles'''
                exist = True
                try:
                    v = network1.vs.find(name=str(uid))
                except ValueError:
                    exist = False
                if exist:
                    # friends = set(network1.neighbors(str(uid)))  # id or name
                    friends = set(network1.successors(str(uid)))
                    if len(friends) > 0:
                        friend_ids = [int(network1.vs[vi]['name']) for vi in friends]  # return id
                        print uid in friend_ids
                        print len(friend_ids)
                        fatts = []
                        alive = 0
                        for fid in friend_ids:
                            fu = com.find_one({'id': fid,
                                               'liwc_anal.result.WC': {'$exists': True}})
                            fu2 = com2.find_one({'id': fid})
                            if fu:
                                f1_time = fu['_id'].generation_time.replace(tzinfo=None)
                                # if eigen_map.get(fu['id'], 0) > 0.0001:
                                if True:
                                    fatt = iot.get_fields_one_doc(fu, fields)
                                    factive = active_days(fu)
                                    if fu2:
                                        f2_time = fu2['_id'].generation_time.replace(tzinfo=None)
                                        if 'status' in fu2:
                                            fsecond_last_post = datetime.strptime(
                                                fu2['status']['created_at'],
                                                '%a %b %d %H:%M:%S +0000 %Y')
                                            if f1_time < fsecond_last_post < f2_time:
                                                alive += 1
                                                factive = active_days(fu2)
                                    fatt.extend(factive)
                                    fatt.extend([eigen_map.get(fu['id'], 0),
                                                 pagerank_map.get(fu['id'], 0),
                                                 indegree_map.get(fu['id'], 0),
                                                 outdegree_map.get(fu['id'], 0)])
                                    fatts.append(fatt)
                        # thredhold = user['friends_count']*0.5
                        if len(fatts) > 0:
                            fatts = np.array(fatts)
                            fmatts = np.mean(fatts, axis=0)
                            values.extend(fmatts)
                            paliv = float(alive) / len(fatts)
                            data.append([user['id_str'], level, drop, created_at,
                                         first_last_post, second_last_post, last_post,
                                         first_scraped_at, second_scraped_at,
                                         average_time, longest_tweet_intervalb, tag,
                                         u_centrality, u_pagerank, u_indegree,
                                         u_outdegree, u_timeline_count]
                                        + values + [len(fatts), paliv])
    df = pd.DataFrame(data, columns=['uid', 'level', 'dropout', 'created_at',
                                     'first_last_post', 'second_last_post', 'last_post',
                                     'first_scraped_at', 'second_scraped_at',
                                     'average_time', 'longest_time_interval', 'group',
                                     'u_eigenvector', 'u_pagerank', 'u_authority',
                                     'u_hub', 'u_timeline_count']
                      + ['u_' + field for field in trimed_fields]
                      + ['u_prior_' + field for field in trimed_fields]
                      + ['u_post_' + field for field in trimed_fields]
                      + ['u_change_' + field for field in trimed_fields]
                      + ['u_' + field for field in prof_names]
                      + ['f_' + tf for tf in trimed_fields]
                      + ['f_' + field for field in prof_names]
                      + ['f_eigenvector', 'f_pagerank', 'f_authority', 'f_hub',
                         'f_num', 'f_palive'])
    df.to_csv(filename)
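# The first_scraped_at / second_scraped_at timestamps above come from MongoDB
# ObjectIds: the first four bytes of an _id encode its creation time, which
# bson exposes as a timezone-aware UTC datetime. Stripping tzinfo makes it
# comparable with the naive datetimes parsed from Twitter's created_at strings:
def _example_objectid_time(doc):
    # doc['_id'] is a bson.ObjectId stored by the scraper
    return doc['_id'].generation_time.replace(tzinfo=None)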
def emotion_dropout_IV_following(filepath):
    '''
    Only use following stats
    :param filepath: output CSV path
    :return:
    '''
    fields = ['senti.result.whole.posm', 'senti.result.whole.posstd',
              'senti.result.whole.negm', 'senti.result.whole.negstd',
              'senti.result.whole.scalem', 'senti.result.whole.scalestd',
              'senti.result.whole.N',
              'senti.result.prior.scalem',
              'senti.result.post.scalem',
              # 'liwc_anal.result.posemo',
              # 'liwc_anal.result.negemo',
              # 'liwc_anal.result.ingest',
              # 'liwc_anal.result.bio',
              # 'liwc_anal.result.body',
              # 'liwc_anal.result.health',
              # 'liwc_anal.result.death',
              # 'liwc_anal.result.anx',
              # 'liwc_anal.result.anger',
              # 'liwc_anal.result.sad'
              ]
    # Use the last two dotted segments so the column names stay unique
    # (e.g. 'whole-posm' rather than a bare 'whole' repeated for every field).
    trimed_fields = ['-'.join(field.split('.')[-2:]) for field in fields]
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
                  'friends_day', 'statuses_day', 'followers_day', 'days',
                  'eigenvector', 'pagerank', 'authority', 'hub']
    attr_names = ['uid', 'group', 'attr', 'level']
    attr_names.extend(['u_' + field for field in trimed_fields])
    # attr_names.extend(['u_prior_' + field for field in trimed_fields])
    # attr_names.extend(['u_post_' + field for field in trimed_fields])
    # attr_names.extend(['u_change_' + field for field in trimed_fields])
    attr_names.extend(['u_' + field for field in prof_names])
    attr_names.extend([
        # 'u_recovery_tweets',
        'u_timeline_count'])
    attr_names.extend(['f_' + field.split('.')[-1] for field in fields])
    attr_names.extend(['f_' + field for field in prof_names])
    attr_names.extend(['f_timeline_count', 'f_num', 'f_palive'])
    print attr_names

    data = []
    name_map = {
        'ed': ('fed', 'fed_sur', 'com', 'com',
               {'level': 1, 'liwc_anal.result.WC': {'$exists': True}}),
        'yg': ('younger', 'younger_sur', 'scom', 'com',
               {'liwc_anal.result.WC': {'$exists': True}}),
        'rd': ('random', 'random_sur', 'scom', 'com',
               {'liwc_anal.result.WC': {'$exists': True}})
    }
    for groupname in ['yg', 'rd', 'ed']:
        dbname1, dbname2, comname1, comname2, filter_que = name_map[groupname]
        print 'Centrality Calculate .........'
        # users = iot.get_values_one_field('fed', 'com', 'id', {'level': {'$lt': 3}})
        # print 'Number of users', len(users)
        # network1 = gt.load_network_subset('fed', 'net', {'user': {'$in': users}, 'follower': {'$in': users}})
        # network1 = gt.load_network('fed', 'net')
        # pickle.dump(network1, open('net.pick', 'w'))
        print 'load network: ' + groupname + '-net.graphml'
        network1 = gt.Graph.Read_GraphML(groupname + '-net.graphml')
        # network1 = pickle.load(open('net.pick', 'r'))
        gt.summary(network1)
        network1_gc = gt.giant_component(network1)
        gt.summary(network1_gc)

        '''Centralities Calculation'''
        eigen = network1_gc.eigenvector_centrality()
        pageranks = network1_gc.pagerank()
        indegree = network1_gc.authority_score()
        outdegree = network1_gc.hub_score()
        # closeness = network.closeness()
        # betweenness = network.betweenness()
        nodes = [int(v['name']) for v in network1_gc.vs]
        eigen_map = dict(zip(nodes, eigen))
        pagerank_map = dict(zip(nodes, pageranks))
        indegree_map = dict(zip(nodes, indegree))
        outdegree_map = dict(zip(nodes, outdegree))
        # closeness_map = dict(zip(nodes, closeness))
        # betweenness_map = dict(zip(nodes, betweenness))

        # print 'load liwc 2 batches: ' + groupname + '-liwc2stage.csv'
        # df = pd.read_pickle(groupname + '-liwc2stage.csv' + '.pick')
        user1 = iot.get_values_one_field(dbname1, comname1, 'id', filter_que)
        print 'load db1: ', dbname1, comname1
        com1 = dbt.db_connect_col(dbname1, comname1)
        print 'load db2: ', dbname2, comname2
        com2 = dbt.db_connect_col(dbname2, comname2)
        for uid in user1:
            # set uid
            row = [uid, groupname]
            # set attrition states
            u1 = com1.find_one({'id': uid})
            u2 = com2.find_one({'id': uid})
            u1_time = u1['_id'].generation_time.replace(tzinfo=None)
            # A user counts as retained (drop = 0) only if the second snapshot
            # holds a post created between the two scrapes.
            drop = 1
            if u2:
                u2_time = u2['_id'].generation_time.replace(tzinfo=None)
                if 'status' in u2:
                    second_last_post = datetime.strptime(u2['status']['created_at'],
                                                         '%a %b %d %H:%M:%S +0000 %Y')
                    if u1_time < second_last_post < u2_time:
                        drop = 0
            row.append(drop)
            row.append(u1.get('level'))
            # set users liwc feature
            uatt = iot.get_fields_one_doc(u1, fields)
            row.extend(uatt)
            # set profile, active days and centralities
            print u1['id']
            row.extend(active_days(u1))
            row.extend([eigen_map.get(u1['id'], 0)])
            row.extend([pagerank_map.get(u1['id'], 0)])
            row.extend([indegree_map.get(u1['id'], 0)])
            row.extend([outdegree_map.get(u1['id'], 0)])
            row.extend([
                # u1['recovery_tweets'],
                u1['timeline_count']])

            exist = True
            try:
                v = network1.vs.find(name=str(uid))
            except ValueError:
                exist = False
            if exist:
                # friends = set(network1.neighbors(str(uid)))  # id or name
                friends = set(network1.successors(str(uid)))
                if len(friends) > 0:
                    friend_ids = [int(network1.vs[vi]['name']) for vi in friends]  # return id
                    print uid in friend_ids
                    print len(friend_ids)
                    fatts = []
                    alive = 0
                    for fid in friend_ids:
                        fu = com1.find_one({'id': fid,
                                            'liwc_anal.result.WC': {'$exists': True}})
                        fu2 = com2.find_one({'id': fid})
                        if fu:
                            f1_time = fu['_id'].generation_time.replace(tzinfo=None)
                            # if eigen_map.get(fu['id'], 0) > 0.0001:
                            if True:
                                fatt = iot.get_fields_one_doc(fu, fields)
                                factive = active_days(fu)
                                if fu2:
                                    f2_time = fu2['_id'].generation_time.replace(tzinfo=None)
                                    if 'status' in fu2:
                                        fsecond_last_post = datetime.strptime(
                                            fu2['status']['created_at'],
                                            '%a %b %d %H:%M:%S +0000 %Y')
                                        if f1_time < fsecond_last_post < f2_time:
                                            alive += 1
                                            factive = active_days(fu2)
                                fatt.extend(factive)
                                fatt.extend([eigen_map.get(fu['id'], 0)])
                                fatt.extend([pagerank_map.get(fu['id'], 0)])
                                fatt.extend([indegree_map.get(fu['id'], 0)])
                                fatt.extend([outdegree_map.get(fu['id'], 0)])
                                fatt.extend([fu['timeline_count']])
                                fatts.append(fatt)
                    if len(fatts) > 0:
                        fatts = np.array(fatts)
                        fmatts = np.mean(fatts, axis=0)
                        row.extend(fmatts)
                        row.append(len(fatts))
                        paliv = float(alive) / len(fatts)
                        print 'Alive %d %d %.3f' % (alive, len(fatts), paliv)
                        row.append(paliv)
            data.append(row)
    df = pd.DataFrame(data, columns=attr_names)
    df.to_csv(filepath, index=False)
def data_split(dbname='TwitterProAna', colname='tweets'):
    # https://stackoverflow.com/questions/8136652/query-mongodb-on-month-day-year-of-a-datetime
    # # Label tweets with dates
    # tweets = dbt.db_connect_col(dbname, colname)
    # basedate = datetime(1970, 1, 1)
    # tweets.create_index([('date_week', pymongo.ASCENDING)])
    # for tweet in tweets.find({}, no_cursor_timeout=True):
    #     creat = tweet['created_at']
    #     detal = creat - basedate
    #     datestr = detal.days // 7 + 1
    #     tweets.update_one({'id': tweet['id']}, {'$set': {"date_week": datestr}}, upsert=False)

    # # Indexing tweets with dates
    # date_index = {}
    # for tweet in tweets.find({}, ['id', 'date_week'], no_cursor_timeout=True):
    #     tid, date = tweet['id'], tweet['date_week']
    #     tlist = date_index.get(date, [])
    #     tlist.append(tid)
    #     date_index[date] = tlist
    # pickle.dump(date_index, open('date_tid_list_week.pick', 'w'))

    # # Bunch tweets in given dates to produce LIWC results
    # tweets = dbt.db_connect_col(dbname, colname)
    # date_index = pickle.load(open('date_tid_list_week.pick', 'r'))
    # timeseries = dbt.db_connect_col(dbname, 'weekseries')
    # for key in date_index.keys():
    #     tlist = date_index[key]
    #     textmass = ''
    #     for tid in tlist:
    #         tweet = tweets.find_one({'id': tid})
    #         text = tweet['text'].encode('utf8')
    #         # replace RT, @, # and http://
    #         match = rtgrex.search(text)
    #         if match is None:
    #             text = mgrex.sub('', text)
    #             text = hgrex.sub('', text)
    #             text = ugrex.sub('', text)
    #             text = text.strip()
    #             if not (text.endswith('.') or text.endswith('?') or text.endswith('!')):
    #                 text += '.'
    #             textmass += " " + text.lower()
    #     words = textmass.split()
    #     # Any text with fewer than 50 words should be looked at with a certain degree of skepticism.
    #     if len(words) > 50:
    #         liwc_result = liwc.summarize_document(' '.join(words))
    #         timeseries.insert({'date': key, 'liwc': liwc_result})

    timeseries = dbt.db_connect_col(dbname, 'weekseries')
    fields = iot.read_fields()
    fields_trim = [f.replace('liwc_anal.result.', '') for f in fields]
    fields = [f.replace('_anal.result', '') for f in fields]
    print len(fields)
    data = []
    basedate = datetime(1970, 1, 1)
    for entry in timeseries.find():
        time = entry['date']
        # date = datetime.strptime(time, '%Y-%m')
        # date = datetime.date(year=int(time[0]), month=int(time[1]))
        # Map the 1-based week index back to the first day of that week.
        days = (time - 1) * 7
        date = basedate + timedelta(days=days)
        features = iot.get_fields_one_doc(entry, fields)
        data.append([date] + features)
    df = pd.DataFrame(data=data, columns=['date'] + fields_trim)
    df.to_csv('ian-liwc-tweets-week.csv')
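# Week bucketing used by data_split(): the commented labeling pass assigns each
# tweet a 1-based week index counted from the Unix epoch, and the live loop
# maps an index back to the first day of its week. A round-trip sketch of the
# two directions with an arbitrary example date:
def _example_week_roundtrip():
    from datetime import datetime, timedelta
    basedate = datetime(1970, 1, 1)
    created_at = datetime(2015, 3, 2)
    week = (created_at - basedate).days // 7 + 1            # labeling step
    week_start = basedate + timedelta(days=(week - 1) * 7)  # data_split() step
    return week, week_start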