def label_dropout_network(g_file, db1n, com1n, db2n, com2n):
    # Label each node of the snapshot network with whether the user dropped out
    # between the two data collections, and with its eigenvector centrality.
    g = gt.Graph.Read_GraphML(g_file)
    allg = gt.Graph.Read_GraphML('fed-net.graphml')
    allg.vs['hub'] = allg.eigenvector_centrality()
    com1 = dbt.db_connect_col(db1n, com1n)
    com2 = dbt.db_connect_col(db2n, com2n)
    labels, hubs = [], []
    for v in g.vs:
        uid = int(v['name'])
        hub = allg.vs.find(name=v['name'])['hub']
        print hub
        u1 = com1.find_one({'id': uid})
        u2 = com2.find_one({'id': uid})
        if u2 is None:  # protected or deleted account
            drop = 1
        else:
            if 'status' not in u1 and 'status' not in u2:  # never tweeted
                drop = 1
            elif 'status' not in u1 and 'status' in u2:  # started to post
                drop = 0
            elif 'status' in u1 and 'status' not in u2:  # deleted tweets
                drop = 0
            elif u2['status']['id'] == u1['status']['id']:  # no new post
                drop = 1
            else:  # new post
                drop = 0
        labels.append(drop)
        hubs.append(hub)
    g.vs['drop'] = labels
    g.vs['cen'] = hubs
    g.write_graphml('drop-' + g_file)
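# Illustrative usage sketch only (not part of the original pipeline): the graph
# file and the database/collection names below are placeholders standing for the
# first and second data-collection snapshots.
def _example_label_dropout():
    label_dropout_network('ed-net.graphml', 'fed', 'com', 'fed2', 'com')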
def count_longest_tweeting_period(dbname, timename, comname):
    # Get each user's latest 10 tweets and record the largest posting interval,
    # counted in days.
    com = dbt.db_connect_col(dbname, comname)
    time = dbt.db_connect_col(dbname, timename)
    for user in com.find({'liwc_anal.result.WC': {'$exists': True}},
                         no_cursor_timeout=True):
        user_id = user['id']
        datas = []
        # sort: 1 = ascending, -1 = descending
        for tweet in time.find({'user.id': user_id}, {'id': 1, 'created_at': 1}) \
                .sort([('id', -1)]).limit(10):
            created_at = datetime.strptime(tweet['created_at'],
                                           '%a %b %d %H:%M:%S +0000 %Y')
            datas.append(created_at)
        diff = [(datas[i] - datas[i + 1]).days for i in xrange(len(datas) - 1)]
        if not diff:  # fewer than two tweets, no interval to measure
            continue
        max_period = max(diff)
        com.update({'id': user_id},
                   {'$set': {'longest_tweet_interval': max_period}},
                   upsert=False)
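# Minimal sketch of the interval computation above on hand-made timestamps
# (dates are invented for illustration; tweets are ordered newest first, as in
# the descending-id query).
def _example_longest_interval():
    from datetime import datetime
    posts = [datetime(2016, 3, 10), datetime(2016, 3, 2), datetime(2016, 2, 20)]
    gaps = [(posts[i] - posts[i + 1]).days for i in range(len(posts) - 1)]
    return max(gaps)  # -> 11 days (20 Feb to 2 Mar)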
def recovery_users_tweet():
    # Gather recovery/treatment related tweets. When constructing the control
    # group, users who have retweeted treatment content are excluded.
    com = dbt.db_connect_col('fed', 'scom')
    times = dbt.db_connect_col('fed', 'timeline')
    newtime = dbt.db_connect_col('fed', 'recover')
    newtime.create_index([('user.id', pymongo.ASCENDING),
                          ('id', pymongo.DESCENDING)])
    newtime.create_index([('id', pymongo.ASCENDING)], unique=True)
    for user in com.find(no_cursor_timeout=True):
        uid = user['id']
        for tweet in times.find({'user.id': uid}):
            text = tweet['text'].encode('utf8')
            # strip "RT @", @mentions and URLs
            text = re.sub(r"(?:(RT\ ?@)|@|https?://)\S+", "", text)
            text = text.strip().lower()
            if 'recover' in text or 'treatment' in text or 'therap' in text \
                    or 'doctor' in text:
                # other candidate keywords tried: 'healing', 'hospital'
                try:
                    newtime.insert(tweet)
                except pymongo.errors.DuplicateKeyError:
                    pass
def bio_information(dbname='TwitterProAna', colname='users'):
    # Extract bio statistics from users' current and historical profiles.
    com = dbt.db_connect_col(dbname, colname)
    bio_hist = dbt.db_connect_col(dbname, 'bio')
    bio_hist.create_index([('id', pymongo.ASCENDING)])
    for row in com.find({'screen_name': {'$exists': True}}, no_cursor_timeout=True):
        name, text = row['name'], row['description']
        date = row['lastPolledFull']
        stats = None
        if text and name:
            stats = dm.process_text(text, name)
        elif text:
            stats = dm.process_text(text)
        if stats:
            stats['date'] = date
            stats['id'] = row['id']
            try:
                bio_hist.insert(stats)
            except pymongo.errors.DuplicateKeyError:
                pass
        # walk the profile history in reverse order, updating the name and
        # description whenever they changed
        for hist in reversed(row['history']):
            if 'name' in hist:
                name = hist['name']
            if 'description' in hist:
                text = hist['description']
            if text:
                stats = dm.process_text(text, name)
                if stats:
                    stats['date'] = hist['lastPolledFull']
                    stats['id'] = row['id']
                    try:
                        bio_hist.insert(stats)
                    except pymongo.errors.DuplicateKeyError:
                        pass
def split_treatment():
    # Split each recovery user's timeline into tweets posted before and after
    # the first treatment-related tweet.
    rec, proed = edrelatedcom.rec_proed()  # based on profiles
    times = dbt.db_connect_col('fed', 'timeline')
    prior = dbt.db_connect_col('fed', 'prior_treat')
    prior.create_index([('user.id', pymongo.ASCENDING), ('id', pymongo.DESCENDING)])
    prior.create_index([('id', pymongo.ASCENDING)], unique=True)
    post = dbt.db_connect_col('fed', 'post_treat')
    post.create_index([('user.id', pymongo.ASCENDING), ('id', pymongo.DESCENDING)])
    post.create_index([('id', pymongo.ASCENDING)], unique=True)
    for user in rec:
        Find = False
        # sort: 1 = ascending, -1 = descending
        for tweet in times.find({'user.id': int(user)}).sort([('id', 1)]):
            if ('retweeted_status' not in tweet) and ('quoted_status' not in tweet):
                text = tweet['text'].encode('utf8')
                # strip "RT @", @mentions and URLs
                text = re.sub(r"(?:(RT\ ?@)|@|https?://)\S+", "", text)
                text = text.strip().lower()
                if 'treatment' in text or 'therap' in text or 'doctor' in text:
                    Find = True
            if Find:
                post.insert(tweet)
            else:
                prior.insert(tweet)
def user_active():
    # Obtain the active duration of users across the two observation windows.
    groups = [
        ('ED', 'fed', 'com', 'fed', 'com_survival',
         {'liwc_anal.result.WC': {'$exists': True},
          'level': 1,
          'senti.result.whole.N': {'$gt': 10}}),
        ('RD', 'random', 'scom', 'random', 'com_survival',
         {'liwc_anal.result.WC': {'$exists': True},
          'senti.result.whole.N': {'$gt': 10}}),
        ('YG', 'younger', 'scom', 'younger', 'com_survival',
         {'liwc_anal.result.WC': {'$exists': True},
          'senti.result.whole.N': {'$gt': 10}})
    ]
    for tag, dbname, comname, dbname2, comname2, filter_values in groups:
        com = dbt.db_connect_col(dbname, comname)
        com2 = dbt.db_connect_col(dbname2, comname2)
        network1 = gt.Graph.Read_GraphML(tag.lower() + '-net-all.graphml')
        network1.vs['alive'] = 0
        network1.vs['duration'] = 0
        for v in network1.vs:
            uid = int(v['name'])
            u1 = com.find_one({'id': uid})
            u2 = com2.find_one({'id': uid})
            if u1 and u2:
                # collection times of the two snapshots
                f1_time = u1['_id'].generation_time.replace(tzinfo=None)
                f2_time = u2['_id'].generation_time.replace(tzinfo=None)
                if 'status' in u2:
                    fsecond_last_post = datetime.strptime(
                        u2['status']['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
                    if f1_time < fsecond_last_post < f2_time:
                        v['alive'] = 1
                    v['duration'] = friends_active_days(u2, f1_time)[0]
        network1.write_graphml(tag.lower() + '-net-all-active.graphml')
def split_control():
    # Split control users' timelines at the same relative point as the treatment
    # group; the mean split point of the treatment group is 0.330912888352.
    times = dbt.db_connect_col('fed', 'timeline')
    control = dbt.db_connect_col('fed', 'control_com')
    prior = dbt.db_connect_col('fed', 'prior_control')
    prior.create_index([('user.id', pymongo.ASCENDING), ('id', pymongo.DESCENDING)])
    prior.create_index([('id', pymongo.ASCENDING)], unique=True)
    post = dbt.db_connect_col('fed', 'post_control')
    post.create_index([('user.id', pymongo.ASCENDING), ('id', pymongo.DESCENDING)])
    post.create_index([('id', pymongo.ASCENDING)], unique=True)
    for user in control.find(no_cursor_timeout=True):
        timeline_count = user['timeline_count']
        cut = int(timeline_count * 0.33)
        count = 0
        # sort: 1 = ascending, -1 = descending
        for tweet in times.find({'user.id': user['id']}).sort([('id', 1)]):
            if count < cut:
                try:
                    prior.insert(tweet)
                except pymongo.errors.DuplicateKeyError:
                    pass
            else:
                try:
                    post.insert(tweet)
                except pymongo.errors.DuplicateKeyError:
                    pass
            count += 1
def refine_recovery_tweets(hash_com, tagcol, refine_tagcol, idx=[4, 58]):
    # Select tweets whose hashtags fall into the given hashtag communities
    # (without non-recovery communities: 18, 102, 4, 58, 88).
    times = dbt.db_connect_col('fed', tagcol)
    rec_refine = dbt.db_connect_col('fed', refine_tagcol)
    rec_refine.create_index([('user.id', pymongo.ASCENDING),
                             ('id', pymongo.DESCENDING)])
    rec_refine.create_index([('id', pymongo.ASCENDING)], unique=True)
    for tweet in times.find():
        hashtags = tweet['entities']['hashtags']
        for hash in hashtags:
            # normalise the hashtag text
            tag = hash['text'].encode('utf-8').lower().replace('_', '').replace('-', '')
            com_id = hash_com.get(tag, -1)
            if com_id > -1 and com_id in idx:
                try:
                    rec_refine.insert(tweet)
                except pymongo.errors.DuplicateKeyError:
                    pass
def label_ed_recovery(hash_com, com_size, idx=[18, 102]):
    # Flag users in prorec_tag whose hashtags fall into the ED-related
    # communities more often than the corpus-level baseline.
    times = dbt.db_connect_col('fed', 'prorec_tag')
    com = dbt.db_connect_col('fed', 'tag_com')
    threshold = float(sum([com_size[i] for i in idx])) / sum(com_size.values())
    print 'threshold: ', threshold
    users = list(set(iot.get_values_one_field('fed', 'prorec_tag', 'user.id')))
    for uid in users:
        target_count, all_count = 0.0, 0.0
        for tweet in times.find({'user.id': uid}):
            hashtags = tweet['entities']['hashtags']
            hash_set = set()
            for hash in hashtags:
                hash_set.add(hash['text'].encode('utf-8').lower()
                             .replace('_', '').replace('-', ''))
            for tag in hash_set:
                com_id = hash_com.get(tag, -1)
                if com_id > -1:
                    all_count += 1
                    if com_id in idx:
                        target_count += 1
        if all_count and target_count / all_count > threshold:
            com.update({'id': uid}, {'$set': {'rec_tageted': True}}, upsert=False)
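# Worked sketch of the baseline threshold above with made-up community sizes:
# communities 18 and 102 hold 30 of 200 hashtags, so a user must have more than
# 15% of their community-mapped hashtags in those communities to be flagged.
def _example_recovery_threshold():
    com_size = {18: 20, 102: 10, 4: 70, 58: 100}
    idx = [18, 102]
    threshold = float(sum(com_size[i] for i in idx)) / sum(com_size.values())
    return threshold  # -> 0.15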
def extract_user(dbname='ed', stream='restream', user='******'):
    # Extract users from the tweet stream, including authors and retweeted users.
    stream = dbt.db_connect_col(dbname, stream)
    com = dbt.db_connect_col(dbname, user)
    com.create_index("id", unique=True)
    for tweet in stream.find({'userextract': {'$exists': False}},
                             no_cursor_timeout=True):
        author = tweet['user']
        author['level'] = 1
        try:
            com.insert(author)
        except pymongo.errors.DuplicateKeyError:
            pass
        if 'retweeted_status' in tweet:
            retweetee = tweet['retweeted_status']['user']
            retweetee['level'] = 1
            try:
                com.insert(retweetee)
            except pymongo.errors.DuplicateKeyError:
                pass
        stream.update_one({'id': tweet['id']},
                          {'$set': {'userextract': True}}, upsert=False)
def recollect_ed(dbname='ed', colname='stream', newcol='restream'):
    # Recollect the stream data to get up-to-date favorite and retweet counts,
    # looking tweets up in batches of 100.
    stream = dbt.db_connect_col(dbname, colname)
    newstream = dbt.db_connect_col(dbname, newcol)
    newstream.create_index("id", unique=True)
    ids = []
    for tweet in stream.find({'recollected': {'$exists': False}},
                             no_cursor_timeout=True):
        stream.update_one({'id': tweet['id']},
                          {'$set': {'recollected': True}}, upsert=False)
        ids.append(tweet['id'])
        if len(ids) >= 100:
            print datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + str(len(ids))
            tweets = tlup.get_tweets_info(ids)
            for t in tweets:
                try:
                    newstream.insert(t)
                except pymongo.errors.DuplicateKeyError:
                    pass
            ids = []
    if ids:  # flush the final partial batch
        print datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + str(len(ids))
        tweets = tlup.get_tweets_info(ids)
        for t in tweets:
            try:
                newstream.insert(t)
            except pymongo.errors.DuplicateKeyError:
                pass
def filter_recovery_sentiment():
    # Classify recovery users by the overall sentiment of their recovery tweets.
    user_count, user_pol = {}, {}
    times = dbt.db_connect_col('fed', 'recovery')
    for tweet in times.find():
        uid = tweet['user']['id']
        pol = tweet['polarity']
        count = user_count.get(uid, 0.0)
        polv = user_pol.get(uid, 0.0)
        user_count[uid] = count + 1
        if pol > 0:
            print ' '.join(tweet['text'].split())
            user_pol[uid] = polv + 1
        elif pol < 0:
            user_pol[uid] = polv - 1
        else:
            user_pol[uid] = polv
    # keep users with at least three recovery tweets
    user_list = [k for k, v in user_count.items() if v >= 3]
    print sum(user_pol[uid] > 0 for uid in user_list)
    print sum(user_pol[uid] < 0 for uid in user_list)
    rec, nonrec = [], []
    com = dbt.db_connect_col('fed', 'scom')
    for uid in user_list:
        if user_pol[uid] > 0:
            rec.append(str(uid))
            user = com.find_one({'id': uid})
            print 'Positive', user['id_str'], user['screen_name'], \
                ' '.join(user['description'].split()).encode('utf-8')
        elif user_pol[uid] < 0:
            nonrec.append(str(uid))
            user = com.find_one({'id': uid})
            print 'Negative', user['id_str'], user['screen_name'], \
                ' '.join(user['description'].split()).encode('utf-8')
    return rec, nonrec
def ed_hashtag():
    # Filter ED-related tweets based on the ED hashtag list and store them in a
    # separate collection.
    dbname = 'fed'
    ed_tags = set(iot.read_ed_hashtags())
    print ed_tags
    times = dbt.db_connect_col(dbname, 'timeline')
    taged = dbt.db_connect_col(dbname, 'ed_tag')
    taged.create_index([('user.id', pymongo.ASCENDING), ('id', pymongo.DESCENDING)])
    taged.create_index([('id', pymongo.ASCENDING)], unique=True)
    for tweet in times.find({'$where': 'this.entities.hashtags.length>0'},
                            no_cursor_timeout=True):
        hashtags = tweet['entities']['hashtags']
        for hash in hashtags:
            value = hash['text'].encode('utf-8').lower().replace('_', '').replace('-', '')
            if value in ed_tags:
                try:
                    taged.insert(tweet)
                except pymongo.errors.DuplicateKeyError:
                    pass
def trandb(dbname, colnam1, colnam2):
    # Copy all documents from the second collection into the first, skipping duplicates.
    time1 = dbt.db_connect_col(dbname, colnam1)
    time2 = dbt.db_connect_col(dbname, colnam2)
    for t in time2.find():
        try:
            time1.insert(t)
        except pymongo.errors.DuplicateKeyError:
            pass
def control_users():
    # Users who never posted recovery tweets form the control group.
    com = dbt.db_connect_col('fed', 'scom')
    recovery_user = set(iot.get_values_one_field('fed', 'recover', 'user.id'))
    control_com = dbt.db_connect_col('fed', 'control_com')
    control_com.create_index("id", unique=True)
    for user in com.find():
        if user['id'] not in recovery_user:
            control_com.insert(user)
def filter_ed_tweets():
    # Filter ED-related tweets based on word2vec similarity to ED keywords.
    from ohsn.edrelated import edrelatedcom
    prorec = edrelatedcom.rec_user('fed', 'scom')
    proed = edrelatedcom.proed_users('fed', 'scom')
    times = dbt.db_connect_col('fed', 'timeline')
    ed_times = dbt.db_connect_col('fed', 'edtimeline')
    ed_times.create_index([('user.id', pymongo.ASCENDING), ('id', pymongo.DESCENDING)])
    ed_times.create_index([('id', pymongo.ASCENDING)], unique=True)
    ed_list = set(['bmi', 'cw', 'ugw', 'gw', 'lbs', 'hw', 'lw', 'kg', 'ed',
                   'eatingdisorder', 'anorexia', 'bulimia', 'anorexic', 'ana',
                   'bulimic', 'mia', 'thinspo', 'bulemia', 'purge', 'binge',
                   'selfharm', 'ednos', 'edprobs', 'edprob', 'proana', 'anamia',
                   'promia', 'askanamia', 'bonespo', 'legspo'])
    model = models.word2vec.Word2Vec.load('data/word2vec')
    import ohsn.api.profiles_check as pc
    print len(prorec + proed)
    for user in prorec + proed:
        for tweet in times.find({'user.id': int(user)}):
            text = tweet['text'].encode('utf8')
            keywords = pc.tokenizer_stoprm(text)
            # average the word2vec similarity between each token and each ED keyword
            sumsim = 0.0
            count = 0
            for word in keywords:
                if word in model:
                    for ed in ed_list:
                        sim = model.similarity(word, ed)
                        sumsim += sim
                        count += 1
            # 0.26 is the average similarity among the ED keywords themselves
            if count != 0 and (sumsim / count) > 0.26:
                try:
                    ed_times.insert(tweet)
                except pymongo.errors.DuplicateKeyError:
                    pass
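# Minimal sketch of the keyword-similarity score used above, on a made-up
# two-token tweet; 'model' is assumed to be the same gensim word2vec model
# loaded in filter_ed_tweets, and the keyword subset here is only illustrative.
def _example_ed_similarity(model, ed_list=('ana', 'thinspo', 'ednos')):
    tokens = ['fasting', 'again']
    sims = [model.similarity(w, e)
            for w in tokens if w in model
            for e in ed_list if e in model]
    # the average similarity is compared against the 0.26 cut-off
    return sum(sims) / len(sims) if sims else 0.0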
def tweet_difference(dbname='fed', comname='scom', timename='timeline'):
    # Calculate the LIWC features of tweets that are retweeted or favorited,
    # processing the timelines of users in the POI collection.
    com = dbt.db_connect_col(dbname, comname)
    times = dbt.db_connect_col(dbname, timename)
    liwc = Liwc()
    for user in com.find():
        print user['id']
        textmass_retweet = ''
        textmass_like = ''
        for tweet in times.find({'user.id': user['id'],
                                 'retweeted_status': {'$exists': False}}):
            retc = tweet['retweet_count']
            favc = tweet["favorite_count"]
            text = tweet['text'].encode('utf8')
            # strip RT markers, mentions, hashtags and URLs
            text = rtgrex.sub('', text)
            text = mgrex.sub('', text)
            text = hgrex.sub('', text)
            text = ugrex.sub('', text)
            text = text.strip()
            if not (text.endswith('.') or text.endswith('?') or text.endswith('!')):
                text += '.'
            if retc > favc:
                textmass_retweet += " " + text.lower()
            if favc > retc:
                textmass_like += " " + text.lower()
        textmass_retweet_words = textmass_retweet.split()
        textmass_like_words = textmass_like.split()
        # only summarize users with enough words for LIWC to be meaningful
        if len(textmass_retweet_words) > 50:
            liwc_result = liwc.summarize_document(' '.join(textmass_retweet_words))
            com.update_one({'id': user['id']},
                           {'$set': {'retweet_liwc.mined': True,
                                     'retweet_liwc.result': liwc_result}},
                           upsert=False)
        if len(textmass_like_words) > 50:
            liwc_result = liwc.summarize_document(' '.join(textmass_like_words))
            com.update_one({'id': user['id']},
                           {'$set': {'like_liwc.mined': True,
                                     'like_liwc.result': liwc_result}},
                           upsert=False)
def check_dumplice(dbname, timename, timename2):
    # Add the tweets of the first collection into the second, skipping duplicates.
    time1 = dbt.db_connect_col(dbname, timename)
    times = dbt.db_connect_col(dbname, timename2)
    for tweet in time1.find({}):
        try:
            times.insert(tweet)
        except pymongo.errors.DuplicateKeyError:
            pass
def remove_random_users(dbname, comname, netname):
    # Remove level-3 users and any network edges that touch them.
    com = dbt.db_connect_col(dbname, comname)
    users = set(iot.get_values_one_field(dbname, comname, 'id', {'level': 3}))
    net = dbt.db_connect_col(dbname, netname)
    for row in net.find(no_cursor_timeout=True):
        uid = row['user']
        fid = row['follower']
        if uid in users or fid in users:
            net.delete_one({'_id': row['_id']})
    com.delete_many({'level': 3})
def ed_tweet_normal_tweet_count():
    # Count each user's ED-tagged tweets against the size of their full timeline.
    user_ids = set(iot.get_values_one_field('fed', 'ed_tag', 'user.id'))
    print len(user_ids)
    com = dbt.db_connect_col('fed', 'com')
    tags = dbt.db_connect_col('fed', 'ed_tag')
    data = []
    for uid in user_ids:
        ed_count = tags.count({'user.id': uid})
        all_count = com.find_one({'id': uid})['timeline_count']
        data.append([uid, ed_count, all_count])
    df = pd.DataFrame(data, columns=['id', 'ed_tweet_count', 'all_tweet_count'])
    df.to_csv('user-ed-stats.csv')
def ED_followee():
    # Put all edges whose follower is an ED user into the follownet collection.
    net = dbt.db_connect_col('fed', 'net2')
    users = set(iot.get_values_one_field('fed', 'scom', 'id'))
    print len(users)
    tem = dbt.db_connect_col('fed', 'follownet')
    for edge in net.find():
        if edge['follower'] in users:
            try:
                tem.insert(edge)
            except pymongo.errors.DuplicateKeyError:
                pass
def unique_tweet(dbname, streamname, timename):
    # get unique tweets in the stream, resolving retweets to the original text
    stream = dbt.db_connect_col(dbname, streamname)
    time = dbt.db_connect_col(dbname, timename)
    time.create_index("id", unique=True)
    for tweet in stream.find({}, no_cursor_timeout=True):
        if 'retweeted_status' in tweet:
            text = tweet['retweeted_status']['text']
        else:
            text = tweet['text']
        try:
            time.insert({'id': tweet['id'], 'text': text})
        except pymongo.errors.DuplicateKeyError:
            pass
def insert_cluster_tweets(dbname, timename, cluster):
    # Insert the ED-tagged tweets of the users in a given cluster into a
    # dedicated collection.
    time = dbt.db_connect_col(dbname, timename)
    time.create_index([('user.id', pymongo.ASCENDING), ('id', pymongo.DESCENDING)])
    time.create_index([('id', pymongo.ASCENDING)], unique=True)
    ed_tweet = dbt.db_connect_col(dbname, 'ed_tag')
    for uid in cluster:
        for tweet in ed_tweet.find({'user.id': int(uid)}):
            try:
                time.insert(tweet)
            except pymongo.errors.DuplicateKeyError:
                pass
def hashtag_users():
    # Collect the profiles of users who posted pro-ED or pro-recovery hashtags.
    com = dbt.db_connect_col('fed', 'com')
    times_ped = list(set(iot.get_values_one_field('fed', 'proed_tag', 'user.id')))
    times_rec = list(set(iot.get_values_one_field('fed', 'prorec_tag', 'user.id')))
    newtime = dbt.db_connect_col('fed', 'tag_com')
    newtime.create_index([('id', pymongo.ASCENDING)], unique=True)
    for users in [times_ped, times_rec]:
        for uid in users:
            user = com.find_one({'id': uid})
            if user is None:  # user not in the profile collection
                continue
            try:
                newtime.insert(user)
            except pymongo.errors.DuplicateKeyError:
                pass
def random_network():
    # Move level-2 users of the random snowball sample into their own collection.
    com = dbt.db_connect_col('random2', 'com')
    com1 = dbt.db_connect_col('random2', 'com2')
    com1.create_index("id", unique=True)
    for user in com.find({'level': 2}, no_cursor_timeout=True):
        try:
            com1.insert(user)
        except pymongo.errors.DuplicateKeyError:
            pass
    com.delete_many({'level': 2})
def copy_com(dbname, com_ori_name, com_des_name):
    # Copy users below level 3 from one collection to another.
    com_ori = dbt.db_connect_col(dbname, com_ori_name)
    com_des = dbt.db_connect_col(dbname, com_des_name)
    com_des.create_index("id", unique=True)
    com_des.create_index([('level', pymongo.ASCENDING),
                          ('following_prelevel_node', pymongo.ASCENDING)],
                         unique=False)
    com_des.create_index([('level', pymongo.ASCENDING),
                          ('follower_prelevel_node', pymongo.ASCENDING)],
                         unique=False)
    for user in com_ori.find({'level': {'$lt': 3}}, no_cursor_timeout=True):
        try:
            com_des.insert(user)
        except pymongo.errors.DuplicateKeyError:
            pass
def friend_network_hashtag_weight(dbname, netname):
    '''
    Weight the friendship network by hashtag/LIWC profile similarity for
    community detection
    :param dbname:
    :param netname:
    :return:
    '''
    user_hash_profile = pickle.load(open('data/user-hash-profile.pick', 'r'))
    net = gt.load_network(dbname, netname)
    fields = iot.read_fields()
    com = dbt.db_connect_col(dbname, 'scom')
    for edge in net.es:
        source_vertex_id = edge.source
        target_vertex_id = edge.target
        source_uid = int(net.vs[source_vertex_id]['name'])
        target_uid = int(net.vs[target_vertex_id]['name'])
        source_user = com.find_one({'id': source_uid})
        target_user = com.find_one({'id': target_uid})
        source_user_liwc = iot.get_fields_one_doc(source_user, fields)
        target_user_liwc = iot.get_fields_one_doc(target_user, fields)
        source_user_liwc.extend(user_hash_profile[source_uid])
        target_user_liwc.extend(user_hash_profile[target_uid])
        print len(target_user_liwc)
        # edge weight: inverse of (1 + Euclidean distance) between the profiles
        dis = spatial.distance.euclidean(source_user_liwc, target_user_liwc)
        edge['weight'] = 1.0 / (1.0 + dis)
    net.write_graphml('ed_weighted_follow.graphml')
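# Minimal sketch of the similarity weight above on toy feature vectors (the
# values are made up; the real vectors concatenate LIWC fields with the
# hashtag-profile entries).
def _example_edge_weight():
    from scipy import spatial
    u = [0.2, 0.5, 0.1]
    v = [0.1, 0.4, 0.3]
    dis = spatial.distance.euclidean(u, v)
    return 1.0 / (1.0 + dis)  # closer profiles give a weight nearer to 1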
def read_user_time(filename):
    # Build a survival-analysis table: one row per user with LIWC fields,
    # activity timing, and a binary dropout (event) flag.
    fields = iot.read_fields()
    trimed_fields = [field.split('.')[-1] for field in fields]
    groups = [
        ('ED', 'fed', 'com',
         {'liwc_anal.result.WC': {'$exists': True}, 'level': 1}),
        ('RD', 'random', 'scom', {'liwc_anal.result.WC': {'$exists': True}}),
        ('YG', 'younger', 'scom', {'liwc_anal.result.WC': {'$exists': True}})
    ]
    data = []
    for tag, dbname, comname, filter_values in groups:
        com = dbt.db_connect_col(dbname, comname)
        for user in com.find(filter_values, no_cursor_timeout=True):
            if 'status' in user:
                created_at = datetime.strptime(user['created_at'],
                                               '%a %b %d %H:%M:%S +0000 %Y')
                scraped_at = user['scrape_timeline_at']
                last_post = datetime.strptime(user['status']['created_at'],
                                              '%a %b %d %H:%M:%S +0000 %Y')
                life_time = diff_day(last_post, created_at)
                # guard against users with zero statuses
                average_time = float(life_time) / max(1, user['statuses_count'])
                longest_tweet_intervalb = user['longest_tweet_interval']
                observation_interval = diff_day(scraped_at, last_post)
                # dropout event: the silence since the last post exceeds the
                # user's longest previous gap by more than 30 days
                if (observation_interval - longest_tweet_intervalb) > 30:
                    death = 1
                else:
                    death = 0
                values = iot.get_fields_one_doc(user, fields)
                data.append([user['id_str'], created_at, last_post, scraped_at,
                             average_time, longest_tweet_intervalb,
                             observation_interval, tag, death] + values)
    df = pd.DataFrame(data, columns=['uid', 'created_at', 'last_post', 'scraped_at',
                                     'average_time', 'longest_time_interval',
                                     'observation_interval', 'group', 'event']
                      + trimed_fields)
    df.to_csv(filename)
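# Illustrative sketch of the dropout rule above, with invented numbers: a user
# whose longest historical gap was 12 days and who has now been silent for 50
# days counts as a dropout event.
def _example_dropout_rule():
    longest_gap = 12
    silence = 50
    return 1 if (silence - longest_gap) > 30 else 0  # -> 1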
def recovery_tweet():
    # Scan the timeline for sentences that mention recovery in a non-negated
    # way and print the matching tweets.
    times = dbt.db_connect_col('fed', 'timeline')
    for tweet in times.find():
        text = tweet['text'].encode('utf8')
        text = text.strip().lower().replace("-", "").replace('_', '')
        # split into rough sentences on punctuation marks and curly quotes
        sentences = re.split(
            r"\s*[;:`\"()?!{}]\s*|--+|\s*-\s+|''|\.\s|\.$|\.\.+"
            r"|\xe2\x80\x9c|\xe2\x80\x9d", text)
        FLAG = False
        for sentence in sentences:
            if 'recover' in sentence:
                if 'not' not in sentence and 'don\'t' not in sentence \
                        and 'never' not in sentence and 'anti' not in sentence \
                        and 'non' not in sentence and 'relapse' not in sentence:
                    FLAG = True
        if FLAG:
            print tweet['id'], ' '.join(tweet['text'].split()).encode('utf-8')
def tag_jaccard(dbname, hash_time, gfilename):
    # Calculate the Jaccard index of hashtag co-occurrence for each edge of the
    # hashtag graph.
    g = gt.Graph.Read_GraphML(gfilename + '_tag_undir.graphml')
    times = dbt.db_connect_col(dbname, hash_time)
    tag_tweets = {}
    for tweet in times.find({'$where': 'this.entities.hashtags.length>0'}):
        hashtags = tweet['entities']['hashtags']
        hash_set = set()
        for hash in hashtags:
            hash_set.add(hash['text'].encode('utf-8').lower()
                         .replace('_', '').replace('-', ''))
        for hash in hash_set:
            ids = tag_tweets.get(hash, set())
            ids.add(tweet['id'])
            tag_tweets[hash] = ids
    pickle.dump(tag_tweets, open(gfilename + '.pick', 'w'))
    g.es['jaccard'] = 0.0
    for edge in g.es:
        source_vertex_id = edge.source
        target_vertex_id = edge.target
        source_name = g.vs[source_vertex_id]['name']
        target_name = g.vs[target_vertex_id]['name']
        source_set = tag_tweets.get(source_name, set())
        target_set = tag_tweets.get(target_name, set())
        if source_set or target_set:
            edge['jaccard'] = float(len(source_set.intersection(target_set))) / \
                len(source_set.union(target_set))
    g.write_graphml(gfilename + '_tag_undir_jarccard.graphml')
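# Toy illustration of the Jaccard index used above: two hashtags that co-occur
# in 2 of the 5 tweets mentioning either one.
def _example_jaccard():
    a = {1, 2, 3, 4}
    b = {3, 4, 5}
    return float(len(a & b)) / len(a | b)  # -> 2 / 5 = 0.4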
def hashtag_related_networks(dbname, timename, netname):
    '''
    Extract users' behavior network for tweets related to hashtags of interest
    :param dbname:
    :param timename:
    :param netname:
    :return:
    '''
    hashtags = iot.read_recovery_ed_keywords()
    timeline = dbutil.db_connect_col(dbname, timename)
    network = dbutil.db_connect_col(dbname, netname)
    network.create_index([("id0", pymongo.ASCENDING),
                          ("id1", pymongo.ASCENDING),
                          ("type", pymongo.ASCENDING),
                          ("statusid", pymongo.ASCENDING)],
                         unique=True)
    filter = {'$and': [{'$where': 'this.entities.hashtags.length>0'},
                       {'$where': 'this.entities.user_mentions.length>0'}]}
    for tweet in timeline.find(filter, no_cursor_timeout=True):
        tags = tweet['entities']['hashtags']
        hash_tag_flag = False
        part = set([])
        for tag in tags:
            tagv = tag['text'].encode('utf-8').lower().replace('_', '').replace('-', '')
            part.add(tagv)
            # if tagv in hashtags:
            hash_tag_flag = True
        if hash_tag_flag:
            udmention_list = []
            if ('retweeted_status' in tweet) and \
                    len(tweet['retweeted_status']['entities']['user_mentions']) > 0:
                for udmention in tweet['retweeted_status']['entities']['user_mentions']:
                    udmention_list.append(udmention['id'])
            for mention in tweet['entities']['user_mentions']:
                if ('in_reply_to_user_id' in tweet) and \
                        (mention['id'] == tweet['in_reply_to_user_id']):
                    # reply
                    add_reply_edge(network, tweet['user']['id'],
                                   tweet['in_reply_to_user_id'],
                                   tweet['created_at'], tweet['id'], list(part))
                elif ('retweeted_status' in tweet) and \
                        (mention['id'] == tweet['retweeted_status']['user']['id']):
                    # retweet
                    add_retweet_edge(network, tweet['user']['id'],
                                     tweet['retweeted_status']['user']['id'],
                                     tweet['created_at'], tweet['id'], list(part))
                elif mention['id'] in udmention_list:
                    # mentions carried over from the retweeted content
                    add_undirect_mentions_edge(network, tweet['user']['id'],
                                               mention['id'], tweet['created_at'],
                                               tweet['id'], list(part))
                else:
                    # original mentions
                    add_direct_mentions_edge(network, tweet['user']['id'],
                                             mention['id'], tweet['created_at'],
                                             tweet['id'], list(part))
def get_values_one_field(dbname, colname, fieldname, filt={}):
    # Collect the value of one (possibly dotted) field for every matching document.
    poi = dbt.db_connect_col(dbname, colname)
    values = []
    for item in poi.find(filt, [fieldname], no_cursor_timeout=True):
        if '.' in fieldname:
            # walk nested documents for dotted field names, e.g. 'user.id'
            levels = fieldname.split('.')
            t = item.get(levels[0], {})
            for level in levels[1:]:
                t = t.get(level)
                if t is None:
                    t = 0.0
                    break
            values.append(t)
        else:
            values.append(item.get(fieldname))
    return values
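# Illustrative call only, mirroring how the helper is used elsewhere in this
# module (e.g. collecting the ids of users who posted recovery tweets); the
# database and collection names are examples.
def _example_get_values():
    return get_values_one_field('fed', 'recover', 'user.id')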
def friend_community():
    # Detect communities on the hashtag-weighted friendship network and print
    # the members of each community.
    net = gt.Graph.Read_GraphML('ed_weighted_follow.graphml')
    gt.net_stat(net)
    com = net.community_infomap(edge_weights='weight')
    comclus = com.subgraphs()
    print len(comclus), com.modularity
    com = dbt.db_connect_col('fed', 'scom')
    index = 0
    hash_com = {}
    for comclu in comclus:
        print '============================================================'
        for v in comclu.vs:
            user = com.find_one({'id': int(v['name'])})
            print v['name'], user['id'], user['screen_name'], \
                ' '.join(user['description'].split()).encode('utf-8')
            hash_com[v['name']] = index
        index += 1
def refine_recovery(dbname, netname):
    '''
    Refine the users who have used the hashtag #recovery
    :param dbname:
    :param netname:
    :return:
    '''
    network = dbutil.db_connect_col(dbname, netname)
    proed = set(['proed', 'proana', 'promia', 'proanorexia', 'proanamia',
                 'proanatips', 'proanatip'])
    proedrel = proed
    # expand the pro-ED seed set with hashtags that co-occur with it
    for link in network.find(no_cursor_timeout=True):
        tags = set(link['tags'])
        if len(proed.intersection(tags)) > 0:
            proedrel = proedrel.union(tags)
    print len(proedrel)
    users = iot.get_values_one_field(dbname, netname, 'id0')
    print len(users)
    for user in users:
        utags = set()
        for link in network.find({'id0': user}):
            utags.update(link['tags'])
        # drop users whose hashtags never touch the pro-ED related set
        if len(utags.intersection(proedrel)) == 0:
            network.delete_many({'id0': user})
def friendship_community_vis(dbname, colname, filename, ctype):
    '''Write the friendship graph out as a vis.js visualisation'''
    ed_users = iot.get_values_one_field(dbname, 'scom', 'id')
    dbcom = dbt.db_connect_col(dbname, 'com')
    fg = gt.load_network(dbname, colname)
    gt.net_stat(fg)
    fg = gt.giant_component(fg, 'WEAK')
    gt.net_stat(fg)
    # community detection: multilevel, label propagation, or infomap
    if ctype == 'ml':
        com = fg.community_multilevel(weights='weight', return_levels=False)
    elif ctype == 'lp':
        fgu = fg.as_undirected(combine_edges=sum)
        init = fgu.community_leading_eigenvector(clusters=2, weights='weight')
        print init.membership
        com = fg.community_label_propagation(weights='weight',
                                             initial=init.membership)
        print com.membership
    else:
        com = fg.community_infomap(edge_weights='weight', trials=2)
    fg.vs['group'] = com.membership
    # give each community a random anchor position and scatter its nodes around it
    Coo = {}
    for x in fg.vs['group']:
        Coo[x] = (rand.randint(-1000, 1000), rand.randint(-1000, 1000))
    with open('data/' + ctype + '_' + filename + '_net_follow.js', 'w') as fw:
        fw.write('var nodes = [\n')
        for idv, v in enumerate(fg.vs):
            user = dbcom.find_one({'id': int(fg.vs[idv]['name'])})
            desc = ' '.join(user['description'].replace('\'', '')
                            .replace('\"', '').split())
            fw.write('{id: ' + str(idv + 1) + ', ' +
                     'label: \'' + user['screen_name'] + '\', ' +
                     'value: ' + str(fg.degree(idv, mode="in")) + ', ' +
                     'title: \'UID: ' + str(fg.vs[idv]['name']) +
                     '<br> Screen Name: ' + user['screen_name'] +
                     '<br> Followers: ' + str(user['followers_count']) +
                     '<br> Followees: ' + str(user['friends_count']) +
                     '<br> Tweets: ' + str(user['statuses_count']) +
                     '<br> Description: ' + str(desc.encode('utf-8')) +
                     '<br> Group: ' + str(fg.vs[idv]['group']) + '\', ' +
                     'x: ' + str(Coo[fg.vs[idv]['group']][0] + rand.randint(0, 300)) + ', ' +
                     'y: ' + str(Coo[fg.vs[idv]['group']][1] + rand.randint(0, 300)) + ', ' +
                     'group: ' + str(fg.vs[idv]['group']) + ', ')
            fw.write('}, \n')
        fw.write('];\n var edges = [\n')
        for ide, e in enumerate(fg.es):
            fw.write('{from: ' + str(e.source + 1) + ', ' +
                     'to: ' + str(e.target + 1) + ', ' +
                     'arrows: ' + '\'to\'' + ', ' +
                     'title: \' Tags: ' + fg.vs[e.source]['name'] + ' ' +
                     fg.vs[e.target]['name'] +
                     '<br> Co-occurrence: ' + str(fg.es[ide]['weight']) + '\', ' +
                     'value: ' + str(fg.es[ide]['weight']) + '},\n')
        fw.write('];\n')
def emotion_dropout_IV_combine(dbname1, dbname2, comname1, comname2):
    '''
    Combine followees and followers together as variables
    :param dbname1:
    :param dbname2:
    :param comname1:
    :param comname2:
    :return:
    '''
    filter_que = {'level': 1, 'liwc_anal.result.WC': {'$exists': True}}
    user1 = iot.get_values_one_field(dbname1, comname1, 'id', filter_que)
    com1 = dbt.db_connect_col(dbname1, comname1)
    com2 = dbt.db_connect_col(dbname2, comname2)
    fields = ['liwc_anal.result.posemo',
              'liwc_anal.result.negemo',
              'liwc_anal.result.anx',
              'liwc_anal.result.anger',
              'liwc_anal.result.sad']
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
                  'friends_day', 'statuses_day', 'followers_day', 'days']
    attr_names = ['uid', 'attr']
    attr_names.extend(['u_' + field.split('.')[-1] for field in fields])
    attr_names.extend(['u_' + field for field in prof_names])
    attr_names.extend(['f_' + field.split('.')[-1] for field in fields])
    attr_names.extend(['f_' + field for field in prof_names])
    attr_names.extend(['f_num', 'f_palive'])
    print attr_names
    network1 = gt.load_network(dbname1, 'net')
    data = []
    for uid in user1:
        row = [uid]
        u1 = com1.find_one({'id': uid})
        u2 = com2.find_one({'id': uid})
        # dropout flag: user absent or silent in the second snapshot
        if u2 is None or u2['timeline_count'] == 0:
            row.append(1)
        else:
            row.append(0)
        uatt = iot.get_fields_one_doc(u1, fields)
        row.extend(uatt)
        row.extend(active_days(u1))
        exist = True
        try:
            v = network1.vs.find(name=str(uid))
        except ValueError:
            exist = False
        if exist:
            friends = set(network1.neighbors(str(uid)))  # neighbor vertex indices
            if len(friends) > 0:
                friend_ids = [int(network1.vs[v]['name']) for v in friends]  # user ids
                print uid in friend_ids
                print len(friend_ids)
                fatts = []
                alive = 0
                for fid in friend_ids:
                    fu = com1.find_one({'id': fid,
                                        'liwc_anal.result.WC': {'$exists': True},
                                        'status': {'$exists': True}})
                    fu2 = com2.find_one({'id': fid})
                    if fu is not None:
                        fatt = iot.get_fields_one_doc(fu, fields)
                        fatt.extend(active_days(fu))
                        fatts.append(fatt)
                        if fu2 is None or fu2['timeline_count'] == 0:
                            alive += 0
                        else:
                            alive += 1
                if len(fatts) > 0:
                    fatts = np.array(fatts)
                    fmatts = np.mean(fatts, axis=0)
                    row.extend(fmatts)
                    row.append(len(fatts))
                    paliv = float(alive) / len(fatts)
                    print 'Alive %d %d %.3f' % (alive, len(fatts), paliv)
                    row.append(paliv)
        data.append(row)
    df = pd.DataFrame(data, columns=attr_names)
    df.to_csv('data-attr-combine.csv', index=False)
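# Illustrative invocation only: 'fed' and 'fed2' stand for the first and second
# data-collection snapshots; the actual database and collection names may differ.
def _example_emotion_dropout_combine():
    emotion_dropout_IV_combine('fed', 'fed2', 'com', 'com')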
def emotion_dropout_IV_split(dbname1, dbname2, comname1, comname2):
    '''
    Split followees and followers into different variables
    :param dbname1:
    :param dbname2:
    :param comname1:
    :param comname2:
    :return:
    '''
    filter_que = {'level': 1, 'liwc_anal.result.WC': {'$exists': True}}
    user1 = iot.get_values_one_field(dbname1, comname1, 'id', filter_que)
    com1 = dbt.db_connect_col(dbname1, comname1)
    com2 = dbt.db_connect_col(dbname2, comname2)
    fields = ['liwc_anal.result.posemo',
              'liwc_anal.result.negemo',
              'liwc_anal.result.anx',
              'liwc_anal.result.anger',
              'liwc_anal.result.sad']
    prof_names = ['friends_count', 'statuses_count', 'followers_count',
                  'friends_day', 'statuses_day', 'followers_day', 'days']
    attr_names = ['uid', 'attr']
    attr_names.extend(['u_' + field.split('.')[-1] for field in fields])
    attr_names.extend(['u_' + field for field in prof_names])
    attr_names.extend(['fr_' + field.split('.')[-1] for field in fields])
    attr_names.extend(['fr_' + field for field in prof_names])
    attr_names.extend(['fr_num', 'fr_palive'])
    attr_names.extend(['fo_' + field.split('.')[-1] for field in fields])
    attr_names.extend(['fo_' + field for field in prof_names])
    attr_names.extend(['fo_num', 'fo_palive'])
    attr_names.extend(['co_' + field.split('.')[-1] for field in fields])
    attr_names.extend(['co_' + field for field in prof_names])
    attr_names.extend(['co_num', 'co_palive'])
    print attr_names
    # columns contributed by each neighbour group (LIWC + profile + count + palive)
    attr_length = len(fields) + len(prof_names) + 2
    network1 = gt.load_network(dbname1, 'net')
    data = []
    for uid in user1:
        row = [uid]
        u1 = com1.find_one({'id': uid})
        u2 = com2.find_one({'id': uid})
        if u2 is None or u2['timeline_count'] == 0:
            row.append(1)
        else:
            row.append(0)
        uatt = iot.get_fields_one_doc(u1, fields)
        row.extend(uatt)
        row.extend(active_days(u1))
        exist = True
        try:
            v = network1.vs.find(name=str(uid))
        except ValueError:
            exist = False
        if exist:
            print '--------------------user %d---------------' % uid
            followees = set([int(network1.vs[v]['name'])
                             for v in network1.successors(str(uid))])
            followers = set([int(network1.vs[v]['name'])
                             for v in network1.predecessors(str(uid))])
            common = followees.intersection(followers)
            followees = followees - common
            followers = followers - common
            # aggregate each neighbour group separately: pure followees,
            # pure followers, and reciprocal ties
            for friend_ids in [followees, followers, common]:
                fatts = []
                alive = 0
                if len(friend_ids) > 0:
                    print uid in friend_ids
                    print len(friend_ids)
                    for fid in friend_ids:
                        fu = com1.find_one({'id': fid,
                                            'liwc_anal.result.WC': {'$exists': True},
                                            'status': {'$exists': True}})
                        fu2 = com2.find_one({'id': fid})
                        if fu is not None:
                            fatt = iot.get_fields_one_doc(fu, fields)  # friends' LIWC
                            fatt.extend(active_days(fu))
                            fatts.append(fatt)
                            if fu2 is None or fu2['timeline_count'] == 0:
                                alive += 0
                            else:
                                alive += 1
                if len(fatts) > 0:
                    fatts = np.array(fatts)
                    fmatts = np.mean(fatts, axis=0)
                    row.extend(fmatts)
                    row.append(len(fatts))
                    paliv = float(alive) / len(fatts)
                    print 'Alive %d %d %.3f' % (alive, len(fatts), paliv)
                    row.append(paliv)
                else:
                    # pad so every row has the same number of columns
                    row.extend([None] * attr_length)
        data.append(row)
    df = pd.DataFrame(data, columns=attr_names)
    df.to_csv('data-attr-split.csv', index=False)