def top_participated_users(user_participation):
    """Collect the top 100 / top 0.1% / top 1% most participating non-bot users.

    user_participation maps a userid to the list of postids the user
    participated in.
    """
    Bot = bot.load_bot()
    sort = sorted(user_participation,
                  key=lambda k: len(user_participation[k]), reverse=True)
    top_100 = []
    top_0_1 = []  #top 0.1% (first 200 ranked users)
    top_1 = []    #top 1% (first 2000 ranked users)
    for i, item in enumerate(sort):
        #print(item, screen_name(item), len(user_participation[item]), bot.check_bot(Bot, item))
        if bot.check_bot(Bot, item) == 0:
            if i < 200:
                top_0_1.append(item)
            if i < 2000:
                top_1.append(item)
            else:
                print('top 1% ', len(user_participation[item]))
                break
            if i < 100:
                top_100.append(item)
    return top_100, top_0_1, top_1

def top_retweeted_users():
    Bot = bot.load_bot()
    dir_name = "RetweetNew/"
    files = os.listdir(dir_name)
    tweet_num = 0
    users = {}
    cascade = {}
    one_cascade = {}
    all_retweet_num = 0
    for postid in files:
        with open(dir_name + postid, 'r') as f:
            tweets = json.load(f)
        for tweet in tweets.values():
            user = tweet['user']
            origin = tweet['origin_tweet']
            cascade[origin] = 1
            if tweet['cascade'] == 1:
                one_cascade[origin] = 1
            if bot.check_bot(Bot, user) == 0:
                users[user] = users.get(user, 0) + tweet['child']
                tweet_num += 1
                all_retweet_num += tweet['child']
    print('all users ', len(users))
    print('all tweets ', tweet_num)
    print('all cascades ', len(cascade))
    print('one cascade ', len(one_cascade))
    print('all retweet num ', all_retweet_num)
    with open('Data/top_retweeted_users', 'w') as f:
        json.dump(users, f)

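# Usage sketch (assumption, not part of the original pipeline): once
# top_retweeted_users() has written 'Data/top_retweeted_users' (a
# userid -> accumulated retweet count mapping), the top accounts can be
# listed like this. print_top_retweeted is an illustrative helper name.
def print_top_retweeted(n=10):
    with open('Data/top_retweeted_users', 'r') as f:
        retweet_counts = json.load(f)
    ranked = sorted(retweet_counts.items(), key=lambda kv: kv[1], reverse=True)
    for userid, count in ranked[:n]:
        print(userid, count)
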
def political_alignment_pearson():
    with open('Data/user_content_polarity.json', 'r') as f:
        content_polarity = json.load(f)
    echo_chamber_users = e_util.get_echo_chamber_users('Data/echo_chamber2.json')
    files = os.listdir('RetweetNew')
    e_user = {}
    ne_user = {}
    e_source = {}
    ne_source = {}
    Bot = bot.load_bot()
    for ccc, postid in enumerate(files):
        with open('RetweetNew/' + postid, 'r') as f:
            tweets = json.load(f)
        print(ccc, postid, len(tweets))
        echo_users = echo_chamber_users[postid]
        for tweet in tweets.values():
            user = tweet['user']
            if bot.check_bot(Bot, user) == 1:
                continue
            #count each user only once
            if e_user.get(user) is not None or ne_user.get(user) is not None:
                continue
            user_politic_score = get_polarity(user)
            content_politic_score = content_polarity.get(user)
            if user_politic_score is not None and content_politic_score is not None:
                user_politic_score = round(user_politic_score, 4)
                if user in echo_users:
                    e_user[user] = user_politic_score
                    e_source[user] = content_politic_score
                else:
                    ne_user[user] = user_politic_score
                    ne_source[user] = content_politic_score
    e_keys = e_user.keys()
    ne_keys = ne_user.keys()
    print('echo', stats.pearsonr([e_user[key] for key in e_keys],
                                 [e_source[key] for key in e_keys]))
    print('necho', stats.pearsonr([ne_user[key] for key in ne_keys],
                                  [ne_source[key] for key in ne_keys]))
    with open('Data/user_polarity_content_polarity.json', 'w') as f:
        json.dump({'e_user': e_user, 'ne_user': ne_user,
                   'e_source': e_source, 'ne_source': ne_source}, f)

def bot_participation():
    Bot = bot.load_bot()
    dir_name = "RetweetNew/"
    files = os.listdir(dir_name)
    bot_list = []
    for postid in files:
        with open(dir_name + postid, 'r') as f:
            tweets = json.load(f)
        users = [tweet['user'] for tweet in tweets.values()]
        bots = [bot.check_bot(Bot, user) for user in users]
        human_count = bots.count(0)
        if human_count > 0:  #avoid division by zero when every user is a bot
            bot_list.append(bots.count(1) / human_count)
    box = BoxPlot(1)
    box.set_data(bot_list, '')
    box.set_xticks('bot_ratio')
    box.save_image('Image/bot_ratio_box.png')

def update():
    """
    Update the retweet graphs with cascade size, bot flag, and child count.
    """
    dir_name = "RetweetNew/"  #assumed, for consistency with the other loaders here
    files = os.listdir(dir_name)
    #cascade calculation
    cascade = {}
    child = {}
    for postid in files:
        cascade[postid] = {}
        child[postid] = {}
        with open(dir_name + postid, 'r') as f:
            tweets = json.load(f)
        for key in tweets.keys():
            tweet = tweets[key]
            origin = tweet['origin_tweet']
            cascade[postid][origin] = cascade[postid].get(origin, 0) + 1
            parent_tweet = tweet['parent_tweet']
            if parent_tweet != tweet['tweet']:
                child[postid][parent_tweet] = child[postid].get(parent_tweet, 0) + 1
    #update
    Bot = bot.load_bot()
    for postid in files:
        print(postid)
        unique_origin = {}
        with open(dir_name + postid, 'r') as f:
            tweets = json.load(f)
        for tweet in tweets.values():
            tweet['cascade'] = cascade[postid][tweet['origin_tweet']]
            tweet['bot'] = bot.check_bot(Bot, tweet['user'])
            tweet['child'] = child[postid].get(tweet['tweet'], 0)
            unique_origin[tweet['origin_tweet']] = 1
        print('unique root', len(unique_origin))
        with open(dir_name + postid, 'w') as f:
            json.dump(tweets, f)

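# Record shape after update(), for reference: the fields written by
# get_tweet() below, plus the three added here.
#
#   {
#     "user": ..., "parent": ..., "origin": ...,              # user ids
#     "tweet": ..., "parent_tweet": ..., "origin_tweet": ...,  # tweet ids
#     "screen_name": ..., "origin_name": ..., "text": ...,
#     "time": ..., "depth": ..., "confirm": ...,
#     "cascade": ...,  # size of the cascade this tweet belongs to
#     "bot": ...,      # bot.check_bot() result for the author
#     "child": ...     # number of direct children of this tweet
#   }
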
def get_echo_chamber_users(file_name):
    print(file_name)
    #return the cached mapping if it has already been built
    if 'echo_chamber2.json' in file_name:
        if os.path.exists('Data/echo_chamber_users2.json'):
            with open('Data/echo_chamber_users2.json', 'r') as f:
                echo_chamber_users = json.load(f)
            print('echo chamber size %s' % len(echo_chamber_users))
            return echo_chamber_users
    with open(file_name) as f:
        echo_chambers = json.load(f)
    Bot = bot.load_bot()
    echo_chamber_users = {}
    count = 0
    for key in echo_chambers:
        users = echo_chambers[key]
        postids = key.split('_')
        #bot check
        for postid in postids:
            for user in users:
                if bot.check_bot(Bot, user) == 0:
                    echo_chamber_users[postid] = echo_chamber_users.get(postid, {})
                    echo_chamber_users[postid][user] = 1
                    count += 1
    print('echo chamber size %s' % count)
    with open('Data/echo_chamber_users2.json', 'w') as f:
        json.dump(echo_chamber_users, f)
    return echo_chamber_users

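# Shape sketch (assumption): the echo chamber JSON is keyed by '_'-joined
# postids (hence key.split('_') above) with the shared users as the value,
# e.g. {"126119_150232": ["111", "222"]}; the ids here are made up.
# get_echo_chamber_users() inverts it, minus bots, into
# {"126119": {"111": 1, "222": 1}, "150232": {"111": 1, "222": 1}}.
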
def time_to_depth_echo_chamber(filename):
    _, _, time_depth, _, user_depth = get_depth_time_series(None)
    print(len(time_depth))
    print("time series data load done")

    echo_chamber_values = {}
    non_echo_chamber_values = {}
    for item in ['time_depth', 'user_depth']:
        echo_chamber_values[item] = {}
        non_echo_chamber_values[item] = {}
        for i in range(1, 20):
            echo_chamber_values[item][i] = []
            non_echo_chamber_values[item][i] = []

    echo_chamber_cascade_root = {}
    cascade_veracity = {}
    echo_chamber_users = e_util.get_echo_chamber_users(filename)
    files = os.listdir('RetweetNew')

    #collect the cascades that echo chamber users participated in
    for postid in files:
        v = veracity_type(postid).title()
        with open('RetweetNew/%s' % postid, 'r') as f:
            tweets = json.load(f)
        for tweet in tweets.values():
            try:
                if tweet['user'] in echo_chamber_users[postid]:
                    echo_chamber_cascade_root[tweet['origin_tweet']] = 1
            except KeyError:
                pass
            cascade_veracity[tweet['origin_tweet']] = v
    print("echo chamber cascade extraction done")

    echo_chamber_cascades = echo_chamber_cascade_root.keys()

    #per-veracity buckets: e = echo chamber, n = non echo chamber
    e = {}
    n = {}
    for item in ['True', 'False', 'Mixed']:
        e[item] = {}
        n[item] = {}
        for d_type in ['user_depth', 'time_depth']:
            e[item][d_type] = {}
            n[item][d_type] = {}
            for i in range(1, 20):
                e[item][d_type][i] = []
                n[item][d_type][i] = []

    for key in time_depth.keys():
        v = cascade_veracity[key]
        if v != 'True' and v != 'False':
            v = 'Mixed'
        if key in echo_chamber_cascades:
            for i in range(1, max(time_depth[key].keys()) + 1):
                try:
                    echo_chamber_values['time_depth'][i].append(time_depth[key][i])
                    echo_chamber_values['user_depth'][i].append(user_depth[key][i])
                    e[v]['time_depth'][i].append(time_depth[key][i])
                    e[v]['user_depth'][i].append(user_depth[key][i])
                except KeyError:
                    pass
        else:
            for i in range(1, max(time_depth[key].keys()) + 1):
                try:
                    non_echo_chamber_values['time_depth'][i].append(time_depth[key][i])
                    non_echo_chamber_values['user_depth'][i].append(user_depth[key][i])
                    n[v]['time_depth'][i].append(time_depth[key][i])
                    n[v]['user_depth'][i].append(user_depth[key][i])
                except KeyError:
                    pass

    box = BoxPlot(1)
    box.set_multiple_data([echo_chamber_values['time_depth'],
                           non_echo_chamber_values['time_depth']])
    box.set_ylog()
    box.set_label('Depth', 'Minutes to Depth')
    box.save_image('%s/time_depth_echo_chamber_box.png' % foldername)

    #draw time/users to depth for cascades with and without echo chamber users
    with open('Data/Figure/5_2_1.json', 'w') as f:
        json.dump([echo_chamber_values['time_depth'],
                   non_echo_chamber_values['time_depth']], f)
    draw_time_to_depth_echo_chamber(
        [echo_chamber_values['time_depth'], non_echo_chamber_values['time_depth']],
        ['echo chamber', 'no echo chamber'], 'median minutes',
        'time_depth_echo_chamber_line')
    draw_time_to_depth_echo_chamber(
        [echo_chamber_values['user_depth'], non_echo_chamber_values['user_depth']],
        ['echo chamber', 'no echo chamber'], 'median unique users',
        'user_depth_echo_chamber_line')
    with open('Data/Figure/5_2_time.json', 'w') as f:
        json.dump({'e': echo_chamber_values['time_depth'][1],
                   'ne': non_echo_chamber_values['time_depth'][1]}, f)

    #CDF of the time to reach depth 1
    cdf = CDFPlot()
    cdf.set_label('Propagation Time', 'CDF')
    cdf.set_log(True)
    cdf.set_data(echo_chamber_values['time_depth'][1], '')
    cdf.set_data(non_echo_chamber_values['time_depth'][1], '')
    cdf.save_image('Image/20181105/depth_propagation_time_cdf.png')

def get_depth_time_series(veracity):
    dir_name = "RetweetNew/"
    files = os.listdir(dir_name)
    depth_time = {}
    depth_user = {}
    cascade_depth_users = {}
    cascade_depth = {}
    userid_cascade = {}  #user <-> origin_tweets, for cascade depth
    cascade_unique_users = {}
    for postid in files:
        if veracity is not None:
            if not get_veracity(postid, veracity):
                continue
        with open(dir_name + postid, 'r') as f:
            tweets = json.load(f)
        sort = {}
        for key in tweets.keys():
            sort[key] = parser.parse(tweets[key]['time'])
        #sort by time
        new_list = sorted(sort.items(), key=lambda x: x[1])
        start_time = new_list[0][1]
        sorted_ids = [item[0] for item in new_list]
        cascade_unique_users[postid] = {}
        unique_users = {}
        max_depth = 0
        cascade_max_depth = {}
        for i, tid in enumerate(sorted_ids):
            tweet = tweets[tid]
            unique_users[tweet['user']] = 1
            #time to first reach each depth within the rumor,
            #regardless of which cascade the tweet belongs to
            if max_depth < tweet['depth']:
                max_depth = tweet['depth']
                depth_time[max_depth] = depth_time.get(max_depth, {})
                elapsed_time = (new_list[i][1] - start_time).total_seconds() / 60  #minutes
                depth_time[max_depth][postid] = elapsed_time
                depth_user[max_depth] = depth_user.get(max_depth, {})
                depth_user[max_depth][postid] = len(unique_users)
            #time to first reach each depth within a cascade
            origin_tweet = tweet['origin_tweet']
            userid_cascade[tweet['user']] = userid_cascade.get(tweet['user'], [])
            userid_cascade[tweet['user']].append(origin_tweet)
            t_depth = tweet['depth']
            cascade_depth[origin_tweet] = cascade_depth.get(origin_tweet, {})
            if cascade_depth[origin_tweet].get(t_depth, -1) == -1:
                #first tweet seen at this depth (depth 1 is the cascade start)
                cascade_depth[origin_tweet][t_depth] = new_list[i][1]
            cascade_unique_users[postid][origin_tweet] = cascade_unique_users[postid].get(origin_tweet, {})
            cascade_unique_users[postid][origin_tweet][tweet['user']] = 1
            cascade_max_depth[origin_tweet] = cascade_max_depth.get(origin_tweet, 0)
            if cascade_max_depth[origin_tweet] < tweet['depth']:
                cascade_max_depth[origin_tweet] = tweet['depth']
                cascade_depth_users[origin_tweet] = cascade_depth_users.get(origin_tweet, {})
                cascade_depth_users[origin_tweet][tweet['depth']] = len(cascade_unique_users[postid][origin_tweet])

    #convert absolute timestamps to minutes elapsed since the cascade started
    for key in cascade_depth.keys():
        times = cascade_depth[key]
        max_depth = max(times.keys())
        for i in range(max_depth, 0, -1):
            if i == 1:
                times[i] = 0
                break
            time_diff = (times[i] - times[1]).total_seconds() / 60
            if time_diff < 0:
                print(key)
                print(times[i], times[i - 1])
            times[i] = time_diff
        cascade_depth[key] = times

    #depth_time, depth_user: time/users to reach each depth of a rumor
    #cascade_depth, cascade_depth_users: time/users to reach each depth of a cascade
    return depth_time, depth_user, cascade_depth, userid_cascade, cascade_depth_users

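# Usage sketch (assumption): cascade_depth returned above maps
# origin_tweet -> {depth: minutes since the cascade started}, so the median
# time to reach each depth across cascades can be summarized like this.
# median_minutes_to_depth is an illustrative helper name.
def median_minutes_to_depth(cascade_depth):
    by_depth = {}
    for times in cascade_depth.values():
        for depth, minutes in times.items():
            by_depth.setdefault(depth, []).append(minutes)
    #depth 1 is always 0 by construction
    return {depth: np.median(vals) for depth, vals in sorted(by_depth.items())}
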
def echo_chamber_diversity(filename):
    dirname = 'Retweet/'
    files = os.listdir(dirname)
    if filename is None:
        echo_chamber_users = {}
        for postid in files:
            echo_chamber_users[postid] = {}
    else:
        echo_chamber_users = e_util.get_echo_chamber_users(filename)
    echo_tweet_diversity = []
    echo_source_diversity = []
    necho_tweet_diversity = []
    necho_source_diversity = []
    for postid in files:
        with open(dirname + postid) as f:
            tweets = json.load(f)
        #collect non echo chamber users
        non_echo_users = {}
        for tweet in tweets.values():
            user = tweet['user']
            if user not in echo_chamber_users[postid]:
                non_echo_users[user] = 1
        print(len(echo_chamber_users[postid]), len(non_echo_users))
        timeline_dir = '../Timeline/'
        #collect source diversity of echo chamber users
        err = 0
        nerr = 0
        for user in echo_chamber_users[postid]:
            try:
                with open(timeline_dir + user, 'r') as f:
                    user_tweets = json.load(f)
            except IOError:
                err += 1
                continue
            tweet_diversity, source_diversity = get_diversity(user_tweets)
            if tweet_diversity is not None:
                echo_tweet_diversity.append(tweet_diversity)
            if source_diversity is not None:
                echo_source_diversity.append(source_diversity)
        for user in non_echo_users:
            try:
                with open(timeline_dir + user, 'r') as f:
                    user_tweets = json.load(f)
            except IOError:
                nerr += 1
                continue
            tweet_diversity, source_diversity = get_diversity(user_tweets)
            if tweet_diversity is not None:
                necho_tweet_diversity.append(tweet_diversity)
            if source_diversity is not None:
                necho_source_diversity.append(source_diversity)

    #CDF
    cdf = CDFPlot()
    cdf.set_label('Retweet Origin Diversity', 'CDF')
    cdf.set_data(echo_tweet_diversity, 'Echo Chamber')
    cdf.set_data(necho_tweet_diversity, 'Non Echo Chamber')
    cdf.set_legends(['Echo Chamber', 'Non Echo Chamber'], 'User Type')
    cdf.save_image('Image/20181002/source_diversity_retweet_cdf.png')

    cdf = CDFPlot()
    cdf.set_label('Source News Diversity', 'CDF')
    cdf.set_data(echo_source_diversity, 'Echo Chamber')
    cdf.set_data(necho_source_diversity, 'Non Echo Chamber')
    cdf.set_legends(['Echo Chamber', 'Non Echo Chamber'], 'User Type')
    cdf.save_image('Image/20181002/source_diversity_news_cdf.png')

    #BoxPlot
    box = BoxPlot(1)
    box.set_data([echo_tweet_diversity, necho_tweet_diversity], '')
    box.set_xticks(['Echo Chamber', 'Non Echo Chamber'])
    box.set_label('', 'Retweet Origin Diversity')
    box.save_image('Image/20181002/source_diversity_retweet.png')

    box = BoxPlot(1)
    box.set_data([echo_source_diversity, necho_source_diversity], '')
    box.set_xticks(['Echo Chamber', 'Non Echo Chamber'])
    box.set_label('', 'Source News Diversity')
    box.save_image('Image/20181002/source_diversity_news.png')

def propagation_parent_to_child():
    Bot = bot.load_bot()
    dirname = 'RetweetNew/'
    files = os.listdir(dirname)
    filename = 'Data/echo_chamber2.json'
    if filename is None:
        echo_chamber_users = {}
        for postid in files:
            echo_chamber_users[postid] = {}
    else:
        echo_chamber_users = e_util.get_echo_chamber_users(filename)

    e_child = {}
    ne_child = {}
    e_time = {}
    ne_time = {}
    for i in range(1, 20):
        e_child[i] = []
        ne_child[i] = []
        e_time[i] = {}
        ne_time[i] = {}
    print(len(echo_chamber_users.keys()))

    for ccc, postid in enumerate(files):
        with open(dirname + postid, 'r') as f:
            tweets = json.load(f)
        #order by time
        sort = {}
        for key in tweets.keys():
            sort[key] = parser.parse(tweets[key]['time'])
        new_list = sorted(sort.items(), key=lambda x: x[1])
        sorted_ids = [item[0] for item in new_list]
        e_users = echo_chamber_users[postid]
        print(len(e_users))
        for tid in sorted_ids:
            parent = tweets[tid]['parent']
            userid = tweets[tid]['user']
            ptid = tweets[tid]['parent_tweet']
            if tweets[tid]['cascade'] < 2:
                continue
            #bot filter
            if bot.check_bot(Bot, userid) != 0:
                continue
            if userid in e_users:
                e_child[tweets[tid]['depth']].append(tweets[tid]['child'])
            else:
                ne_child[tweets[tid]['depth']].append(tweets[tid]['child'])
            if tweets[tid]['depth'] > 1:
                diff = (parser.parse(tweets[tid]['time']) -
                        parser.parse(tweets[ptid]['time'])).total_seconds() / 60
                #sanity check: tweets are processed in time order, so the
                #first recorded delay per parent should be the smallest
                if e_time[tweets[ptid]['depth']].get(ptid, -1) > diff:
                    print(e_time[tweets[ptid]['depth']][ptid], diff)
                if parent in e_users:
                    if e_time[tweets[ptid]['depth']].get(ptid, -1) == -1:
                        e_time[tweets[ptid]['depth']][ptid] = diff
                else:
                    if ne_time[tweets[ptid]['depth']].get(ptid, -1) == -1:
                        ne_time[tweets[ptid]['depth']][ptid] = diff

    #remove zero child counts
    for i in range(1, 20):
        e_child[i] = [x for x in e_child[i] if x != 0]
        ne_child[i] = [x for x in ne_child[i] if x != 0]

    box = BoxPlot(1)
    box.set_multiple_data([e_child, ne_child])
    box.set_ylog()
    box.set_label('Depth', 'Child Count')
    box.save_image('Image/%s/child_num_wo_propagation.png' % folder)

    #keep only the first-recorded delay per parent, as a list per depth
    for i in range(1, 20):
        e_time[i] = list(e_time[i].values())
        ne_time[i] = list(ne_time[i].values())

    box = BoxPlot(1)
    box.set_multiple_data([e_time, ne_time])
    box.set_ylog()
    box.set_label('Depth', 'Propagation Time')
    box.save_image('Image/%s/child_time_propagation.png' % folder)

    with open('Data/Figure/5_3_1.json', 'w') as f:
        json.dump({'e_time': e_time, 'ne_time': ne_time,
                   'e_child': e_child, 'ne_child': ne_child}, f)

def propagation_time_to_group(filename):
    #get all echo chamber users
    Bot = bot.load_bot()
    dirname = 'RetweetNew/'
    files = os.listdir(dirname)
    if filename is None:
        echo_chamber_users = {}
        for postid in files:
            echo_chamber_users[postid] = {}
    else:
        echo_chamber_users = e_util.get_echo_chamber_users(filename)

    echo_p = []
    echo_r = []
    necho_p = []
    necho_r = []
    for ccc, postid in enumerate(files):
        with open(dirname + postid, 'r') as f:
            tweets = json.load(f)
        #mark tweets posted by echo chamber users (bot-filtered, cascade >= 2)
        echo_chamber_tweet = {}
        for tid in tweets.keys():
            if tweets[tid]['cascade'] < 2:
                continue
            if bot.check_bot(Bot, tweets[tid]['user']) != 0:
                continue
            if tweets[tid]['user'] in echo_chamber_users[postid]:
                echo_chamber_tweet[tid] = 1

        for tweet in tweets.values():
            tid = tweet['tweet']
            pid = tweet['parent_tweet']
            rid = tweet['origin_tweet']
            #bot filter
            if bot.check_bot(Bot, tweets[tid]['user']) != 0:
                continue
            if tid != pid:  #not a cascade root
                r_time = (parser.parse(tweets[tid]['time']) -
                          parser.parse(tweets[rid]['time'])).total_seconds() / 60
                p_time = (parser.parse(tweets[tid]['time']) -
                          parser.parse(tweets[pid]['time'])).total_seconds() / 60
                if pid in echo_chamber_tweet:
                    echo_p.append(p_time)
                else:
                    necho_p.append(p_time)
                if rid in echo_chamber_tweet:
                    echo_r.append(r_time)
                else:
                    necho_r.append(r_time)
        if ccc % 10 == 0:
            print(ccc)
    return echo_p, necho_p, echo_r, necho_r

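# Usage sketch (assumption): comparing the delay distributions returned by
# propagation_time_to_group(). `stats` (scipy.stats) and `np` are already
# used elsewhere in this module; stats.mannwhitneyu is a standard scipy
# call, used here illustratively.
def compare_propagation_times(filename):
    echo_p, necho_p, echo_r, necho_r = propagation_time_to_group(filename)
    print('parent->child medians', np.median(echo_p), np.median(necho_p))
    print('root->child medians', np.median(echo_r), np.median(necho_r))
    print('parent U-test', stats.mannwhitneyu(echo_p, necho_p))
    print('root U-test', stats.mannwhitneyu(echo_r, necho_r))
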
def rumor_propagation_velocity(filename):
    #get all echo chamber users
    Bot = bot.load_bot()
    dirname = 'RetweetNew/'
    files = os.listdir(dirname)
    if filename is None:
        echo_chamber_users = {}
        for postid in files:
            echo_chamber_users[postid] = {}
    else:
        echo_chamber_users = e_util.get_echo_chamber_users(filename)

    echo_v = []
    necho_v = []
    #propagation time from each parent to its children (parent --> last child)
    echo_p = {}
    necho_p = {}
    for i in range(1, 20):
        echo_p[i] = []
        necho_p[i] = []
    tweet_depth = {}

    for ccc, postid in enumerate(files):
        with open(dirname + postid, 'r') as f:
            tweets = json.load(f)
        #order by time
        sort = {}
        for key in tweets.keys():
            sort[key] = parser.parse(tweets[key]['time'])
        new_list = sorted(sort.items(), key=lambda x: x[1])
        sorted_ids = [item[0] for item in new_list]

        #parent tweet id -> [parent time, child times...]
        parent_child = {}
        echo_chamber_parent = {}
        for i, tid in enumerate(sorted_ids):
            tweet = tweets[tid]['tweet']
            parent = tweets[tid]['parent_tweet']
            if tweets[tid]['cascade'] < 2:
                continue
            #bot filter
            if bot.check_bot(Bot, tweets[tid]['user']) != 0:
                continue
            if tweet != parent:
                parent_child[parent] = parent_child.get(parent, [])
                #the parent always comes earlier than its children
                if len(parent_child[parent]) == 0:
                    #index 0 holds the parent's own time
                    parent_child[parent].append(parser.parse(tweets[parent]['time']))
                parent_child[parent].append(new_list[i][1])
                tweet_depth[parent] = tweets[parent]['depth']
            else:
                #root tweet of a cascade
                parent_child[parent] = [new_list[i][1]]
            #is the parent an echo chamber user?
            if tweets[tid]['parent'] in echo_chamber_users[postid]:
                echo_chamber_parent[parent] = 1

        #per-parent total span and per-child offsets from the parent's time
        parent_child_diff = {}
        parent_child_median_diff = {}
        for key in parent_child.keys():
            times = parent_child[key]
            parent_child_diff[key] = (max(times) - min(times)).total_seconds() / 60
            parent_child_median_diff[key] = []
            start_time = times[0]
            for time in times[1:]:
                parent_child_median_diff[key].append(
                    (time - start_time).total_seconds() / 60)

        for key in parent_child_diff:
            if parent_child_diff[key] == 0:
                #parent with no children
                continue
            if key in echo_chamber_parent:
                echo_p[tweet_depth[key]].append(np.median(parent_child_median_diff[key]))
                echo_v.append(parent_child_diff[key])
            else:
                necho_p[tweet_depth[key]].append(np.median(parent_child_median_diff[key]))
                necho_v.append(parent_child_diff[key])
    return echo_v, necho_v, echo_p, necho_p

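# Usage sketch (assumption): drawing the parent-to-last-child time
# distributions from rumor_propagation_velocity() with CDFPlot, using only
# CDFPlot methods that already appear in this module; the output path is
# illustrative.
def draw_propagation_velocity_cdf(filename):
    echo_v, necho_v, _, _ = rumor_propagation_velocity(filename)
    cdf = CDFPlot()
    cdf.set_label('Minutes from Parent to Last Child', 'CDF')
    cdf.set_log(True)
    cdf.set_data(echo_v, 'Echo Chamber')
    cdf.set_data(necho_v, 'Non Echo Chamber')
    cdf.set_legends(['Echo Chamber', 'Non Echo Chamber'], 'User Type')
    cdf.save_image('Image/propagation_velocity_cdf.png')
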
def get_tweet(path):
    with open(path, 'r') as f:
        lines = f.readlines()
    t = {}
    unique_u = {}
    Bot = bot.load_bot()
    for line in lines:
        tweet_dict = json.loads(line)
        tweet = Tweet(tweet_dict)
        t_id1 = tweet['id_str']
        u_id1 = tweet['user']['id_str']
        tweet1 = tweet['text']
        screen_name = tweet['user']['screen_name']
        time1 = tweet['created_at']
        unique_u[u_id1] = 1
        if bot.check_bot(Bot, u_id1) == 1:
            continue
        #is it a retweet?
        try:
            retweet = tweet.get('retweeted_status', None)
            if retweet is None:
                retweet = tweet.get('quoted_status', None)
            if retweet is None:
                t[t_id1] = {
                    'user': u_id1, 'parent': u_id1, 'origin': u_id1,
                    'confirm': True, 'text': tweet1,
                    'origin_tweet': t_id1, 'parent_tweet': t_id1, 'tweet': t_id1,
                    'screen_name': screen_name, 'origin_name': screen_name,
                    'time': time1, 'depth': 1
                }
            else:
                tweet2 = retweet['text']
                t_id2 = retweet['id_str']
                u_id2 = retweet['user']['id_str']
                origin_name = retweet['user']['screen_name']
                time2 = retweet['created_at']
                t[t_id1] = {
                    'user': u_id1, 'parent': u_id2, 'origin': u_id2,
                    'confirm': False, 'text': tweet1,
                    'origin_tweet': t_id2, 'parent_tweet': t_id2, 'tweet': t_id1,
                    'screen_name': screen_name, 'origin_name': origin_name,
                    'time': time1, 'depth': 2
                }
                t[t_id2] = {
                    'user': u_id2, 'parent': u_id2, 'origin': u_id2,
                    'confirm': True, 'text': tweet2,
                    'origin_tweet': t_id2, 'parent_tweet': t_id2, 'tweet': t_id2,
                    'screen_name': origin_name, 'origin_name': origin_name,
                    'time': time2, 'depth': 1
                }
                unique_u[u_id2] = 1
        except KeyError:
            #not a retweet
            print("Key Error Exception!!!!")
            t[t_id1] = {
                'user': u_id1, 'parent': u_id1, 'origin': u_id1,
                'confirm': True, 'text': tweet1,
                'origin_tweet': t_id1, 'parent_tweet': t_id1, 'tweet': t_id1,
                'screen_name': screen_name, 'origin_name': screen_name,
                'time': time1, 'depth': 1
            }

    #If the follower and friend counts match the number of unique users, the
    #retweet network can be reconstructed; also require more than 100 tweets,
    #otherwise return None.
    f_count = 0
    fr_count = 0
    for uid in unique_u.keys():
        if os.path.exists('../Data/followers/followers/' + uid):
            f_count += 1
    for uid in unique_u.keys():
        if os.path.exists('../Data/friends/friends/' + uid):
            fr_count += 1
    if len(t) <= 100:
        return 0, None
    print(path)
    print('unique_users : %s , collected followers : %s, collected friends : %s'
          % (len(unique_u), f_count, fr_count))
    if f_count == len(unique_u) and fr_count == len(unique_u):
        print('%s : %s tweets' % (path, len(t)))
        return 1, t
    elif f_count == len(unique_u):
        print('%s : %s tweets' % (path, len(t)))
        return 2, t
    elif fr_count == len(unique_u):
        return 3, t
    else:
        return 0, t

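# Usage sketch (assumption): get_tweet() returns (status, network), where
# status 1 means both follower and friend lists are complete for every user.
# A caller might persist only those networks; the directory names here are
# illustrative.
def build_networks(raw_dir='Raw/', out_dir='RetweetNew/'):
    for name in os.listdir(raw_dir):
        status, network = get_tweet(raw_dir + name)
        if status == 1 and network is not None:
            with open(out_dir + name, 'w') as f:
                json.dump(network, f)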