def cascade_change():
    with open('Data/echo_chamber2.json', 'r') as f:
        echo_chambers = json.load(f)

    count = 0
    for keys in echo_chambers.keys():
        users = echo_chambers[keys]
        cascade_series, user_index = get_cascade_time_series(keys, users)

        for key in cascade_series.keys():
            cascade = cascade_series[key]
            user = user_index[key]

            line = LinePlot()
            line.set_ylog()
            line.set_xlog()
            line.set_label('User', 'Cascade')  # x: user arrival order, y: cascade value
            line.set_axvline(user)
            line.set_plot_data(cascade, np.arange(1, len(cascade) + 1, 1))
            #line.set_plot_data(np.arange(1, len(cascade) + 1, 1), cascade)
            #line.set_hline(user, 0, len(cascade))
            #line.set_legends(['True', 'False', 'Mixed'])
            #line.set_xticks(x_ticks)
            line.save_image('Image/Cascade/cascade_change_line_%s_%s.png' % (keys, key))

        count += 1
        if count > 10:  # debugging limit: only the first few echo chambers
            break
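# ---------------------------------------------------------------------------
# LinePlot is imported from this repo's plotting helpers. As a reading aid, the
# class below is a minimal matplotlib-based sketch of the interface the call
# sites in this file assume -- in particular the (y, x) argument order of
# set_plot_data. The name _LinePlotSketch is hypothetical; nothing in this
# file uses it, it only illustrates the assumed behavior.
# ---------------------------------------------------------------------------
import matplotlib.pyplot as plt

class _LinePlotSketch(object):
    def __init__(self):
        self.fig, self.ax = plt.subplots()

    def set_ylog(self):
        self.ax.set_yscale('log')

    def set_xlog(self):
        self.ax.set_xscale('log')

    def set_label(self, xlabel, ylabel):
        self.ax.set_xlabel(xlabel)
        self.ax.set_ylabel(ylabel)

    def set_axvline(self, x, *args):
        # mark a position on the x axis (e.g. where an echo chamber user appears)
        self.ax.axvline(x, linestyle='--', color='gray')

    def set_plot_data(self, y, x):
        # note: y comes first, matching calls like set_plot_data(yticks, x_ticks)
        self.ax.plot(x, y)

    def set_legends(self, legends):
        self.ax.legend(legends)

    def set_xticks(self, ticks):
        self.ax.set_xticks(list(range(len(ticks))))          # tick positions
        self.ax.set_xticklabels([str(t) for t in ticks])     # tick labels

    def set_yticks(self, labels, index=None):
        self.ax.set_yticks(index if index is not None else list(range(len(labels))))
        self.ax.set_yticklabels(labels)

    def save_image(self, path):
        self.fig.savefig(path)
        plt.close(self.fig)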
def user_to_depth(user_depth, user_depth2, user_depth3):
    #time_depth = get_depth_time_series('True')
    #time_depth2 = get_depth_time_series('False')
    #time_depth3 = get_depth_time_series('Mixture,Mostly True,Mostly False')

    #mean number of unique users to reach each depth
    print(user_depth)
    x_ticks1 = sorted(user_depth.keys())
    x_ticks2 = sorted(user_depth2.keys())
    x_ticks3 = sorted(user_depth3.keys())

    y_ticks1 = [np.mean(list(user_depth[depth].values())) for depth in x_ticks1]
    y_ticks2 = [np.mean(list(user_depth2[depth].values())) for depth in x_ticks2]
    y_ticks3 = [np.mean(list(user_depth3[depth].values())) for depth in x_ticks3]

    # use the longest depth range for the shared x-axis ticks
    if len(x_ticks1) > len(x_ticks2) and len(x_ticks1) > len(x_ticks3):
        x_ticks = x_ticks1
    elif len(x_ticks2) > len(x_ticks1) and len(x_ticks2) > len(x_ticks3):
        x_ticks = x_ticks2
    else:
        x_ticks = x_ticks3

    line = LinePlot()
    line.set_ylog()
    line.set_label('Depth', 'Mean Unique Users')
    # plot each series against its own depth keys so lengths match
    line.set_plot_data(y_ticks1, x_ticks1)
    line.set_plot_data(y_ticks2, x_ticks2)
    line.set_plot_data(y_ticks3, x_ticks3)
    line.set_legends(['True', 'False', 'Mixed'])
    line.set_xticks(x_ticks)
    line.save_image('Image/user_depth_line.png')
def breadth_change():
    with open('Data/echo_chamber2.json', 'r') as f:
        echo_chambers = json.load(f)

    count = 0
    for keys in echo_chambers.keys():
        users = echo_chambers[keys]
        breadth_series, user_index = get_breadth_time_series(keys, users)

        for key in breadth_series.keys():
            breadth = breadth_series[key]
            user = user_index[key]

            line = LinePlot()
            #line.set_ylog()
            line.set_label('Users', 'Breadth')
            line.set_plot_data(breadth, np.arange(1, len(breadth) + 1, 1))
            line.set_axvline(user)
            #line.set_legends(['True', 'False', 'Mixed'])
            #line.set_xticks(x_ticks)
            line.save_image('Image/Breadth/breadth_change_line_%s_%s.png' % (keys, key))

        count += 1
        if count > 0:  # only the first echo chamber; raise the limit to draw more
            break
def velocity_change():
    with open('Data/echo_chamber2.json', 'r') as f:
        echo_chambers = json.load(f)

    count = 0
    for keys in echo_chambers.keys():
        users = echo_chambers[keys]
        velocity_series, user_index, published_index = get_velocity_time_series(
            keys, users, 'True,False,Mixture,Mostly False,Mostly True')

        for key in velocity_series.keys():
            velocity = velocity_series[key]
            user = user_index[key]
            published_date = published_index[key]

            line = LinePlot()
            line.set_ylog()
            line.set_label('User', 'Time Diff')
            line.set_axvline(user, published_date)
            line.set_plot_data(velocity, np.arange(1, len(velocity) + 1, 1))
            #line.set_plot_data(np.arange(1, len(velocity) + 1, 1), velocity)
            #line.set_hline(user, 0, len(velocity))
            #line.set_legends(['True', 'False', 'Mixed'])
            #line.set_xticks(x_ticks)
            line.save_image('Image/Velocity/velocity_change_line_%s_%s.png' % (keys, key))

        count += 1
        if count > 100:
            break
def draw_5_2_1_figures():
    with open('Data/Figure/5_2_1.json', 'r') as f:
        data = json.load(f)
    #draw_time_to_depth_echo_chamber([echo_chamber_values['time_depth'], non_echo_chamber_values['time_depth']], ['echo chamber', 'no echo chamber'], 'median minutes', 'time_depth_echo_chamber_line')

    x_ticks = range(1, 18)
    # 17 depth tick labels plus a trailing blank label
    x_tickslabel = list(range(0, 17)) + ['']

    line = LinePlot()
    line.set_ylog()
    line.set_label('Depth', 'Depth Increment Time')
    print(len(data))

    for item in data:
        #yticks = [np.mean(item[str(depth)]) for depth in x_ticks]
        #yticks = [np.mean(outlier.remove_outlier(item[str(depth)])) for depth in x_ticks]
        yticks = [np.median(item[str(depth)]) for depth in x_ticks]  # JSON keys are strings
        print(yticks)
        line.set_plot_data(yticks, x_ticks)

    line.set_legends(['Echo chamber', 'Non-echo chamber'])
    line.set_xticks(x_tickslabel)
    line.set_yticks(['0', '1 m', '5 m', '1 h', '1 day', '10 day'],
                    index=[0, 1, 5, 60, 24 * 60, 24 * 10 * 60])
    line.save_image('Image/Figure/5_2_1.png')
def draw_time_to_depth_echo_chamber(data, legend, data_type, filename):
    x_ticks = np.arange(1, 20)

    line = LinePlot()
    line.set_ylog()
    line.set_label('Depth', data_type)

    for item in data:
        #yticks = [np.mean(item[depth]) for depth in x_ticks]
        #yticks = [np.mean(outlier.remove_outlier(item[depth])) for depth in x_ticks]
        yticks = [np.median(item[depth]) for depth in x_ticks]
        line.set_plot_data(yticks, x_ticks)

    line.set_legends(legend)
    line.set_xticks(x_ticks)
    line.save_image('%s/%s.png' % (foldername, filename))
def time_to_depth():
    _, _, time_depth_cascade, user_ids, cascade_depth_users = get_depth_time_series('True')
    _, _, time_depth_cascade2, user_ids2, cascade_depth_users2 = get_depth_time_series('False')
    _, _, time_depth_cascade3, user_ids3, cascade_depth_users3 = get_depth_time_series('Mixture,Mostly True,Mostly False')

    t = {}; e = {}; u_all = {}
    #true, false, mixture time to depth
    t_td = {}; f_td = {}; m_td = {}
    #true, false, mixture user to depth
    t_ud = {}; f_ud = {}; m_ud = {}

    for i in range(1, 20):
        t[i] = []; e[i] = []; u_all[i] = []
        t_td[i] = []; f_td[i] = []; m_td[i] = []
        t_ud[i] = []; f_ud[i] = []; m_ud[i] = []

    for key in time_depth_cascade.keys():
        for i in range(1, max(time_depth_cascade[key].keys())):  # depths 1 .. max_depth - 1
            t[i].append(time_depth_cascade[key][i])
            t_td[i].append(time_depth_cascade[key][i])
            try:
                u_all[i].append(cascade_depth_users[key][i])
                t_ud[i].append(cascade_depth_users[key][i])
            except KeyError:
                pass

    for key in time_depth_cascade2.keys():
        for i in range(1, max(time_depth_cascade2[key].keys())):
            f_td[i].append(time_depth_cascade2[key][i])
            try:
                f_ud[i].append(cascade_depth_users2[key][i])
            except KeyError:
                pass

    for key in time_depth_cascade3.keys():
        for i in range(1, max(time_depth_cascade3[key].keys())):
            m_td[i].append(time_depth_cascade3[key][i])
            try:
                m_ud[i].append(cascade_depth_users3[key][i])
            except KeyError:
                pass

    x_ticks = np.arange(1, 20)

    """
    depth_list = []
    veracity_list = []
    time_list = []
    for depth in x_ticks:
        for value in t_td[depth]:
            depth_list.append(depth)
            time_list.append(value)
            veracity_list.append('True')
        for value in f_td[depth]:
            depth_list.append(depth)
            time_list.append(value)
            veracity_list.append('False')
        for value in m_td[depth]:
            depth_list.append(depth)
            time_list.append(value)
            veracity_list.append('Mixed')

    df = pd.DataFrame({'time': time_list, 'depth': depth_list, 'type': veracity_list})
    line = LinePlot()
    line.set_sns_plot(df)
    """

    #median minutes to reach each depth
    y_ticks1 = [np.median(t_td[depth]) for depth in x_ticks]
    y_ticks2 = [np.median(f_td[depth]) for depth in x_ticks]
    y_ticks3 = [np.median(m_td[depth]) for depth in x_ticks]
    print(y_ticks1)
    print(y_ticks2)
    print(y_ticks3)

    line = LinePlot()
    line.set_ylog()
    line.set_label('Depth', 'Median Minutes')
    line.set_plot_data(y_ticks1, x_ticks)
    line.set_plot_data(y_ticks2, x_ticks)
    line.set_plot_data(y_ticks3, x_ticks)
    line.set_legends(['True', 'False', 'Mixed'])
    line.set_xticks(x_ticks)
    line.save_image('%s/time_depth_line_echo_chamber.png' % foldername)

    #mean number of unique users to reach each depth
    u_ticks1 = [np.mean(t_ud[depth]) for depth in x_ticks]
    u_ticks2 = [np.mean(f_ud[depth]) for depth in x_ticks]
    u_ticks3 = [np.mean(m_ud[depth]) for depth in x_ticks]
    print(u_ticks1)

    line = LinePlot()
    line.set_ylog()
    line.set_label('Depth', 'Mean Unique Users')
    line.set_plot_data(u_ticks1, x_ticks)
    line.set_plot_data(u_ticks2, x_ticks)
    line.set_plot_data(u_ticks3, x_ticks)
    line.set_legends(['True', 'False', 'Mixed'])
    line.set_xticks(x_ticks)
    line.save_image('%s/user_depth_line_echo_chamber.png' % foldername)
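# A hypothetical illustration (invented values) of the per-cascade data shapes
# that time_to_depth() assumes get_depth_time_series() returns: dicts keyed by
# cascade id and then by depth.
def _example_depth_series_shapes():
    time_depth_cascade = {'post123': {1: 0.5, 2: 4.0, 3: 60.0}}  # depth -> minutes to reach it
    cascade_depth_users = {'post123': {1: 1, 2: 5, 3: 12}}       # depth -> unique users so far
    for key in time_depth_cascade:
        for i in range(1, max(time_depth_cascade[key].keys())):  # mirrors the loop bounds above
            print(i, time_depth_cascade[key][i], cascade_depth_users[key].get(i))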
def draw_graph():
    depth_time1, depth_user1, unique_user_time1, cascade_depth1 = time_series('True')
    depth_time2, depth_user2, unique_user_time2, cascade_depth2 = time_series('False')

    x_ticks1 = sorted(depth_time1.keys())
    y_ticks1 = [np.mean(list(depth_time1[depth].values())) for depth in x_ticks1]
    x_ticks2 = sorted(depth_time2.keys())
    y_ticks2 = [np.mean(list(depth_time2[depth].values())) for depth in x_ticks2]

    #draw mean minutes - depth line plot
    line = LinePlot()
    line.set_ylog()
    line.set_label('Depth', 'Mean Minutes')
    line.set_plot_data(y_ticks1, x_ticks1)
    line.set_plot_data(y_ticks2, x_ticks2)
    line.set_legends(['True', 'False'])
    line.save_image('Image/time_depth_line.png')

    x_ticks1 = sorted(unique_user_time1.keys())
    x_ticks2 = sorted(unique_user_time2.keys())
    y_ticks1 = [np.mean(list(unique_user_time1[num].values())) for num in x_ticks1]
    y_ticks2 = [np.mean(list(unique_user_time2[num].values())) for num in x_ticks2]

    #draw mean minutes - unique users line plot
    line = LinePlot()
    line.set_ylog()
    line.set_label('Unique Users', 'Mean Minutes')
    line.set_plot_data(y_ticks1, x_ticks1)
    line.set_plot_data(y_ticks2, x_ticks2)
    line.set_xticks(x_ticks1)
    line.set_legends(['True', 'False'])
    line.save_image('Image/time_users_line.png')

    all_depth_true = [[key] * len(depth_time1[key]) for key in depth_time1.keys()]
    all_depth_false = [[key] * len(depth_time2[key]) for key in depth_time2.keys()]
    all_depth_sum_true = []
    all_depth_sum_false = []
    for item in all_depth_true:
        all_depth_sum_true.extend(item)
    for item in all_depth_false:
        all_depth_sum_false.extend(item)

    #Depth CDF, CCDF
    #cdf = CDFPlot()
    #cdf.set_data(all_depth_sum_true, 'True')
    #cdf.set_data(all_depth_sum_false, 'False')
    #cdf.set_legends(['True', 'False'], '')
    #cdf.save_image('Image/depth_cdf.png')

    true_cascade = []
    false_cascade = []
    for postid in cascade_depth1.keys():
        for depth in cascade_depth1[postid].values():  #origin tweet : depth
            true_cascade.extend(depth)
    for postid in cascade_depth2.keys():
        for depth in cascade_depth2[postid].values():
            false_cascade.extend(depth)

    print('true')
    for i in range(1, 15):
        print(i, true_cascade.count(i))
    print('false')
    for i in range(1, 15):
        print(i, false_cascade.count(i))

    cdf = CDFPlot()
    cdf.set_legends(['True', 'False'], '')
    cdf.set_xlim(0, 11)
    #cdf.set_log(True)
    #cdf.set_ylog()
    cdf.set_label('Depth', 'CDF')
    cdf.set_data(true_cascade, 'True')
    cdf.set_data(false_cascade, 'False')
    cdf.save_image('Image/depth_cdf.png')
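# CDFPlot is another repo helper; for reference, a minimal numpy sketch of how
# an empirical CDF like the depth CDF above can be computed (an assumption
# about the technique, not the repo's implementation):
def _empirical_cdf(samples):
    xs = np.sort(np.asarray(samples))
    ys = np.arange(1, len(xs) + 1) / float(len(xs))  # P(X <= x) at each sorted sample
    return xs, ys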
category_all.append(item)

category_count = []
for c in category_all:
    count_list = []
    for i in range(2011, 2018):
        count = category_count_by_year(i, c)
        count_list.append(count)
    category_count.append(count_list)
#print(category_count)

df = pd.DataFrame(category_count, index=category_all, columns=range(2011, 2018))
#print(df)

LinePlt = LinePlot()
LinePlt.set_label('year', 'number of articles')
LinePlt.set_plot_data(category_count, 'category count')
LinePlt.set_xticks(range(2011, 2018))
LinePlt.set_legends(category_all)
LinePlt.save_image('./image/category_count_year.png')

print("top key words by category")
writer = pd.ExcelWriter('./trending_words/trending_keywords_category.xlsx', engine='xlsxwriter')
dataframe_list = []
category_data = []
for item in categories:
    words = frequency(titles_category(item))
    category_data.append(tuple_to_string(words))

df = pd.DataFrame(category_data, index=categories, columns=range(1, 11))
dataframe_list.append(df)
#df.to_csv('./trending_words/%s.csv' % i, encoding='utf-8')
def trim_trends(trend_values):
    # parse a stringified numpy array like '[0 3 12\n 7 ...]' into a list of ints
    trend_values = trend_values.replace('[', '').replace(']', '')
    trend_values = trend_values.replace('\n', '').split()
    return [int(v) for v in trend_values]


if __name__ == '__main__':
    conn, cursor = sql_connect()
    trend_values = [trim_trends(value) for value in get_all_trends()]

    LinePlt = LinePlot()
    LinePlt.set_label('days', 'trends')
    LinePlt.set_plot_data(trend_values, 'Google Trends')
    #LinePlt.set_xticks(range(2011, 2018))
    #LinePlt.set_legends(category_all)
    LinePlt.save_image('./image/trends_line.png')

    #draw with mean value
    trend_values = np.array(trend_values)
    trend_mean = np.mean(trend_values, axis=0)
    print(trend_mean)
    trend_mean = trend_mean.tolist()
    print(trend_mean)

    LinePlt = LinePlot()
    LinePlt.set_label('days', 'trends')
    LinePlt.set_plot_data(trend_mean, 'Google Trends')
    #LinePlt.set_xticks(range(2011, 2018))
    #LinePlt.set_legends(category_all)
    LinePlt.save_image('./image/trends_mean_line.png')  # assumed output path for the mean-trend figure
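# A hypothetical example of the stored trend format trim_trends() parses: the
# values returned by get_all_trends() are assumed to be stringified numpy
# arrays, e.g. str(np.array([0, 3, 12, 7, 99])).
def _trim_trends_example():
    sample = '[ 0  3 12\n  7 99]'
    assert trim_trends(sample) == [0, 3, 12, 7, 99]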
def edge_homogeneity():
    files = os.listdir(dir_name)
    retweet_cache = {}
    homogeneity = []

    for ccc, postid in enumerate(files):
        with open(dir_name + '%s' % postid, 'r') as f:
            tweets = json.load(f)
        retweet_cache[postid] = tweets

        for tweet in tweets.values():
            p_score = get_polarity(tweet['user'])

            #calculate edge homogeneity
            if tweet['depth'] != 1:
                #compare with parent if parent is not the root node
                p_score2 = get_polarity(tweet['parent'])
                if p_score == -999 or p_score2 == -999:  # -999 marks missing polarity
                    continue
                e = p_score * p_score2
                homogeneity.append(round(e, 1))

    #compare with echo chamber nodes' edge homogeneity
    echo_chamber_users = {}
    e_homogeneity = []
    ne_homogeneity = []

    with open('Data/echo_chamber2.json') as f:
        echo_chamber = json.load(f)

    for key in echo_chamber:
        users = echo_chamber[key]
        if len(users) < 1:
            continue
        for postid in key.split('_'):
            echo_chamber_users[postid] = echo_chamber_users.get(postid, {})
            for user in users:
                echo_chamber_users[postid][user] = 1

    for postid in echo_chamber_users.keys():
        tweets = retweet_cache[postid]
        for tweet in tweets.values():
            #echo chamber users' edge homogeneity
            if tweet['depth'] != 1:
                p_score = get_polarity(tweet['user'])
                p_score2 = get_polarity(tweet['parent'])
                if p_score == -999 or p_score2 == -999:
                    continue
                e = p_score * p_score2
                if tweet['user'] in echo_chamber_users[postid]:
                    e_homogeneity.append(e)
                else:
                    ne_homogeneity.append(e)

    draw_cdf_plot([e_homogeneity, ne_homogeneity], 'Homogeneity',
                  ['Echo Chambers', 'Non-Echo Chambers'], 'User type', 'homogeneity')

    with open('Data/homogeneity.json', 'w') as f:
        json.dump({'e': e_homogeneity, 'ne': ne_homogeneity}, f)

    x_ticks = np.arange(-1, 1.1, 0.1)
    x_ticks = np.around(x_ticks, decimals=1)

    # round before counting, since the stored values are unrounded and would
    # never match the rounded tick values exactly
    e_rounded = [round(v, 1) for v in e_homogeneity]
    ne_rounded = [round(v, 1) for v in ne_homogeneity]
    e_count = [e_rounded.count(x) for x in x_ticks]
    ne_count = [ne_rounded.count(x) for x in x_ticks]

    line = LinePlot()
    line.set_ylog()
    line.set_label('Homogeneity', 'Number of Edges')
    line.set_plot_data(e_count, x_ticks)
    line.set_plot_data(ne_count, x_ticks)
    line.set_legends(['Echo Chambers', 'Non-Echo Chambers'])
    line.set_xticks(x_ticks)
    line.save_image('Image/%s/homogeneity_line.png' % foldername)
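# Edge homogeneity above is the product of the polarity scores of a tweet's
# author and its parent's author; given the [-1, 1] x-axis used for the counts,
# polarities appear to lie in [-1, 1], so a positive product means both
# endpoints lean the same way. A small worked example with invented scores:
def _edge_homogeneity_example():
    parent, child = 0.8, 0.5
    print(parent * child)   # 0.4  -> homogeneous edge (same leaning)
    print(parent * -0.5)    # -0.4 -> cross-cutting edge (opposite leanings)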
def draw_6_3_1_figures():
    with open('Data/Figure/6_3.json', 'r') as f:
        data = json.load(f)
    #json.dump({'rumor' : rumor_num, 'cascade_num' : cascade_num}, f)

    all_cascade_num = 48644
    all_tweet_num = 310545
    all_retweet_count = 264653

    rumor_num = data['rumor']
    cascade_num = data['cascade_num']
    all_retweet_num = data['all_user']
    all_retweet_median_num = data['all_median']
    all_retweet_mean_num = data['all_mean']

    x_ticks = range(0, len(rumor_num))
    print('top 10% echo chamber ', len(rumor_num))

    line = LinePlot()
    line.set_ylog()
    line.set_label('Rank', 'Number of Rumors')
    yticks = [rumor_num[i] for i in x_ticks]
    line.set_plot_data(yticks, x_ticks)
    line.save_image('Image/Figure/6_3_1.png')

    x_ticks = range(0, len(cascade_num))
    line = LinePlot()
    line.set_ylog()
    line.set_label('Rank', 'Number of Cascades')
    yticks = [cascade_num[i] for i in x_ticks]
    line.set_plot_data(yticks, x_ticks)
    line.save_image('Image/Figure/6_3_2.png')
    print('all', len(x_ticks))

    #portion of cascades covered by the top-ranked echo chambers; the list
    #already holds only the top 10%, so e.g. 0.01 of it is the top 0.1% overall
    echo_num = len(cascade_num)
    top_01 = int(echo_num * 0.01)
    top_1 = int(echo_num * 0.1)
    top_5 = int(echo_num * 0.5)
    top_10 = -1  # last entry of the top-10% list

    top01_p = cascade_num[top_01] / all_cascade_num * 100
    top1_p = cascade_num[top_1] / all_cascade_num * 100
    top5_p = cascade_num[top_5] / all_cascade_num * 100
    top10_p = cascade_num[top_10] / all_cascade_num * 100
    print(top01_p, top1_p, top5_p, top10_p)

    barplot = BarPlot(1)
    barplot.set_data([0, 1, 2, 3], [top01_p, top1_p, top5_p, top10_p], '')
    barplot.set_xticks(['0.1%', '1%', '5%', '10%'])
    #barplot.set_ylim(100)
    barplot.set_label('Hub Echo Chambers', 'Participation of Cascades (%)')
    barplot.save_image('Image/Figure/6_3_3.png')
    print(top_01, top_1, top_5)

    top01_n = all_retweet_num[top_01] / all_retweet_count * 100
    top1_n = all_retweet_num[top_1] / all_retweet_count * 100
    top5_n = all_retweet_num[top_5] / all_retweet_count * 100
    top10_n = all_retweet_num[top_10] / all_retweet_count * 100
    print(all_retweet_num[top_10])
    print(top01_n, top1_n, top5_n, top10_n)

    barplot = BarPlot(1)
    barplot.set_multiple_data([top01_p, top1_p, top5_p, top10_p],
                              [top01_n, top1_n, top5_n, top10_n])
    barplot.set_xticks(['0.1%', '1%', '5%', '10%'])
    barplot.set_ylim(50)
    barplot.set_label('Hub Echo Chambers', 'Portion of Cascades (%)')
    #barplot.set_legends(['Cascade', 'Retweet'], '')
    barplot.save_image('Image/Figure/6_3_4.png')
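# The cutoffs in draw_6_3_1_figures() index into a list that already holds only
# the top 10% of echo chambers, so a fraction of that list maps to a ten-times
# smaller fraction overall. A quick check with an invented list size:
def _top_rank_cutoff_example():
    echo_num = 2000               # hypothetical size of the stored top-10% list
    print(int(echo_num * 0.01))   # 20   -> top 0.1% of all echo chambers
    print(int(echo_num * 0.1))    # 200  -> top 1% of all
    print(int(echo_num * 0.5))    # 1000 -> top 5% of all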