Example #1
def user_to_depth(user_depth, user_depth2, user_depth3):
    #time_depth = get_depth_time_series('True')    
    #time_depth2 = get_depth_time_series('False')    
    #time_depth3 = get_depth_time_series('Mixture,Mostly True,Mostly False')    

    #mean time to get to depth
    print(user_depth)
    x_ticks = np.arange(0, 20, 1)
    x_ticks1 = sorted(user_depth.keys())
    x_ticks2 = sorted(user_depth2.keys())
    x_ticks3 = sorted(user_depth3.keys())
    y_ticks1 = [np.mean(list(user_depth[depth].values())) for depth in x_ticks1]
    y_ticks2 = [np.mean(list(user_depth2[depth].values())) for depth in x_ticks2]
    y_ticks3 = [np.mean(list(user_depth3[depth].values())) for depth in x_ticks3]
    if len(x_ticks1) > len(x_ticks2) and len(x_ticks1) > len(x_ticks3):
        x_ticks = x_ticks1
    elif len(x_ticks2) > len(x_ticks1) and len(x_ticks2) > len(x_ticks3):
        x_ticks = x_ticks2
    else:
        x_ticks = x_ticks3
    line = LinePlot()
    line.set_ylog()
    line.set_label('Depth', 'Mean Unique Users')
    line.set_plot_data(y_ticks1, x_ticks1)
    line.set_plot_data(y_ticks2, x_ticks2)
    line.set_plot_data(y_ticks3, x_ticks3)
    line.set_legends(['True', 'False', 'Mixed'])
    line.set_xticks(x_ticks)
    line.save_image('Image/user_depth_line.png')
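These excerpts assume module-level imports (numpy as np, json, pandas as pd, os) and globals such as foldername and dir_name, plus a project-local LinePlot wrapper that is never defined here. The class below is only a minimal sketch of what that wrapper might look like on top of matplotlib, reconstructed from the method names and (y, x) argument order used in the examples; the actual implementation in the repository may differ.

import matplotlib
matplotlib.use('Agg')  # assumption: figures are written to disk, no display needed
import matplotlib.pyplot as plt


class LinePlot:
    """Minimal sketch of the line-plot wrapper assumed by these examples."""

    def __init__(self):
        self.fig, self.ax = plt.subplots()

    def set_ylog(self):
        # log-scaled y axis, used for minutes / user counts spanning orders of magnitude
        self.ax.set_yscale('log')

    def set_label(self, xlabel, ylabel):
        self.ax.set_xlabel(xlabel)
        self.ax.set_ylabel(ylabel)

    def set_plot_data(self, y_values, x_values):
        # one call per series; note the (y, x) argument order used throughout the examples
        self.ax.plot(list(x_values), list(y_values), marker='o')

    def set_legends(self, labels):
        self.ax.legend(labels)

    def set_xticks(self, ticks):
        self.ax.set_xticks(list(ticks))

    def set_yticks(self, labels, index):
        # e.g. set_yticks(['0', '1 m', ...], index=[0, 1, ...]) as in Example #2
        self.ax.set_yticks(index)
        self.ax.set_yticklabels(labels)

    def save_image(self, path):
        self.fig.savefig(path, bbox_inches='tight')
        plt.close(self.fig)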
Example #2
def draw_5_2_1_figures():
    with open('Data/Figure/5_2_1.json', 'r') as f:
        data = json.load(f)

    #print(data)
    #draw_time_to_depth_echo_chamber([echo_chamber_values['time_depth'], non_echo_chamber_values['time_depth']], ['echo chamber', 'no echo chamber'], 'median minutes', 'time_depth_echo_chamber_line')
    #x_ticks = np.arange(1,18)
    x_ticks = range(1, 18)
    line = LinePlot()
    line.set_ylog()
    line.set_label('Depth', 'Depth Increment Time')
    print(len(data))
    x_tickslabel = list(range(0, 17))
    x_tickslabel.append('')
    for item in data:
        #yticks = [np.mean(item[depth]) for depth in x_ticks]
        yticks = [np.median(item[str(depth)]) for depth in x_ticks]
        #u_ticks1 = [np.mean(outlier.remove_outlier(item[depth])) for depth in x_ticks]
        print(yticks)
        line.set_plot_data(yticks, x_ticks)
    print(x_ticks)
    #print(x_ticks[0]['time_depth'])
    #print(x_ticks[1]['time_depth'])
    line.set_legends(['Echo chamber', 'Non-echo chamber'])
    line.set_xticks(x_tickslabel)
    line.set_yticks(['0', '1 m', '5 m', '1 h', '1 day', '10 day'],
                    index=[0, 1, 5, 60, 24 * 60, 24 * 10 * 60])
    line.save_image('Image/Figure/5_2_1.png')
Example #3
def draw_time_to_depth_echo_chamber(data, legend, data_type, filename):
    x_ticks = np.arange(1,20)
    line = LinePlot()
    line.set_ylog()
    line.set_label('Depth', data_type)
    for item in data:
        #yticks = [np.mean(item[depth]) for depth in x_ticks]
        yticks = [np.median(item[depth]) for depth in x_ticks]
        #u_ticks1 = [np.mean(outlier.remove_outlier(item[depth])) for depth in x_ticks]
        line.set_plot_data(yticks, x_ticks)
    line.set_legends(legend)
    line.set_xticks(x_ticks)
    line.save_image('%s/%s.png'%(foldername, filename))
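For reference, a commented-out line in Example #2 shows how this helper is expected to be called. The snippet below is a hypothetical reconstruction of that call; echo_chamber_values and non_echo_chamber_values are assumed inputs not defined in these excerpts, with each 'time_depth' entry mapping depth -> list of minutes, matching what the median computation above expects.

# Hypothetical usage, reconstructed from the commented-out call in Example #2.
draw_time_to_depth_echo_chamber(
    [echo_chamber_values['time_depth'], non_echo_chamber_values['time_depth']],
    ['echo chamber', 'no echo chamber'],
    'median minutes',
    'time_depth_echo_chamber_line')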
Example #4
def time_to_depth():
    #index = filename.replace(".json", "").split('echo_chamber')
    #print(index)
    _, _, time_depth_cascade, user_ids, cascade_depth_users = get_depth_time_series('True')    
    _, _, time_depth_cascade2, user_ids2, cascade_depth_users2 = get_depth_time_series('False')    
    _, _, time_depth_cascade3, user_ids3, cascade_depth_users3 = get_depth_time_series('Mixture,Mostly True,Mostly False')    
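    # Structure assumed from how the values are used below (not shown in these excerpts):
    #   time_depth_cascade[cascade_id][depth]  -> minutes taken to first reach `depth`
    #   cascade_depth_users[cascade_id][depth] -> unique users observed at `depth`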


    t = {}; e = {}; u_all = {}
    #true, false, mixture time to depth
    t_td = {}; f_td = {}; m_td = {};
    #true, false, mixture user to depth
    t_ud = {}; f_ud = {}; m_ud = {};

    for i in range(1,20):
        t[i] = [] 
        e[i] = []
        u_all[i] = []
        t_td[i] = []
        f_td[i] = []
        m_td[i] = []
        t_ud[i] = []
        f_ud[i] = []
        m_ud[i] = []

    for key in time_depth_cascade.keys():
        for i in range(1, max(time_depth_cascade[key].keys()) + 1):
            t[i].append(time_depth_cascade[key][i]) # 1 ~ max_depth 
            t_td[i].append(time_depth_cascade[key][i]) # 1 ~ max_depth 
            try:
                u_all[i].append(cascade_depth_users[key][i])
                t_ud[i].append(cascade_depth_users[key][i])
            except KeyError :
                pass 
            
    for key in time_depth_cascade2.keys():
        for i in range(1, max(time_depth_cascade2[key].keys()) + 1):
            f_td[i].append(time_depth_cascade2[key][i]) # 1 ~ max_depth 
            try:
                f_ud[i].append(cascade_depth_users2[key][i])
            except KeyError :
                pass 
    for key in time_depth_cascade3.keys():
        for i in range(1, max(time_depth_cascade3[key].keys()) + 1):
            m_td[i].append(time_depth_cascade3[key][i]) # 1 ~ max_depth 
            try:
                m_ud[i].append(cascade_depth_users3[key][i])
            except KeyError :
                pass 

    x_ticks = np.arange(1,20)
    """
    depth_list = []
    veracity_list = []
    time_list = []
    for depth in x_ticks:
        for value in t_td[depth]:
            depth_list.append(depth)
            time_list.append(value)
            veracity_list.append('True')
        
        for value in f_td[depth]:
            depth_list.append(depth)
            time_list.append(value)
            veracity_list.append('False')

        for value in m_td[depth]:
            depth_list.append(depth)
            time_list.append(value)
            veracity_list.append('Mixed')

    df = pd.DataFrame({'time':time_list, 'depth':depth_list, 'type':veracity_list}) 
    line = LinePlot()
    line.set_sns_plot(df)
    """
    y_ticks1 = [np.median(t_td[depth]) for depth in x_ticks]
    y_ticks2 = [np.median(f_td[depth]) for depth in x_ticks]
    y_ticks3 = [np.median(m_td[depth]) for depth in x_ticks]

    print(y_ticks1)
    print(y_ticks2)
    print(y_ticks3)
   
    line = LinePlot()
    line.set_ylog()
    line.set_label('Depth', 'Median Minutes')
    line.set_plot_data(y_ticks1, x_ticks)
    line.set_plot_data(y_ticks2, x_ticks)
    line.set_plot_data(y_ticks3, x_ticks)
    line.set_legends(['True', 'False', 'Mixed'])
    line.set_xticks(x_ticks)
    line.save_image('%s/time_depth_line_echo_chamber.png'%(foldername))
    
    #number of users to depth 
    u_ticks1 = [np.mean(t_ud[depth]) for depth in x_ticks]
    u_ticks2 = [np.mean(f_ud[depth]) for depth in x_ticks]
    u_ticks3 = [np.mean(m_ud[depth]) for depth in x_ticks]
    print(u_ticks1)

    line = LinePlot()
    line.set_ylog()
    line.set_label('Depth', 'Mean Unique Users')
    line.set_plot_data(u_ticks1, x_ticks)
    line.set_plot_data(u_ticks2, x_ticks)
    line.set_plot_data(u_ticks3, x_ticks)
    line.set_legends(['True', 'False', 'Mixed'])
    line.set_xticks(x_ticks)
    line.save_image('%s/user_depth_line_echo_chamber.png'%(foldername))
Example #5
def draw_graph():
    depth_time1, depth_user1, unique_user_time1, cascade_depth1 = time_series('True')

    x_ticks1 = sorted(depth_time1.keys())
    y_ticks1 = [np.mean(list(depth_time1[depth].values())) for depth in x_ticks1]

    depth_time2, depth_user2, unique_user_time2, cascade_depth2 = time_series('False')

    x_ticks2 = sorted(depth_time2.keys())
    y_ticks2 = [np.mean(list(depth_time2[depth].values())) for depth in x_ticks2]

    #draw mean minutes - depth line plot 
    line = LinePlot()
    line.set_ylog()
    line.set_label('Depth', 'Mean Minutes')
    line.set_plot_data([y_ticks1, y_ticks2], x_ticks1)
    line.set_legends(['True', 'False'])
    line.save_image('Image/time_depth_line.png')

    x_ticks1 = sorted(unique_user_time1.keys())
    x_ticks2 = sorted(unique_user_time2.keys())
    y_ticks1 = [np.mean(list(unique_user_time1[num].values())) for num in x_ticks1]
    y_ticks2 = [np.mean(list(unique_user_time2[num].values())) for num in x_ticks2]
    
    #draw mean minutes - unique users line plot 
    line = LinePlot()
    line.set_ylog()
    line.set_label('Unique Users', 'Mean Minutes')
    line.set_plot_data([y_ticks1, y_ticks2], x_ticks1)
    line.set_xticks(x_ticks1)
    line.set_legends(['True', 'False'])
    line.save_image('Image/time_users_line.png')

    all_depth_true = [[key] * len(depth_time1[key]) for key in depth_time1.keys()] #True
    all_depth_false = [[key] * len(depth_time2[key]) for key in depth_time2.keys()] #False
    all_depth_sum_true = []
    all_depth_sum_false = []

    for item in all_depth_true:
        all_depth_sum_true.extend(item)
    for item in all_depth_false:
        all_depth_sum_false.extend(item)

    #Depth CDF, CCDF
    #cdf = CDFPlot()
    #cdf.set_data(all_depth_sum_true, 'True')
    #cdf.set_data(all_depth_sum_false, 'False')
    #cdf.set_legends(['True', 'False'], '')
    #cdf.save_image('Image/depth_cdf.png')

    true_cascade = []
    false_cascade = []
    for postid in cascade_depth1.keys():
        for depth in cascade_depth1[postid].values(): #origin tweet : depth
            true_cascade.append(depth)
 
    for postid in cascade_depth2.keys():
        for depth in cascade_depth2[postid].values(): #origin tweet : depth
            false_cascade.append(depth)
   

    print('true')
    for i in range(1, 15):
        print(i, true_cascade.count(i))
    print('false')
    for i in range(1, 15):
        print(i, false_cascade.count(i))
    
    cdf = CDFPlot()
    cdf.set_legends(['True', 'False'], '')
    cdf.set_xlim(0, 11)
    #cdf.set_log(True)
    #cdf.set_ylog()
    cdf.set_label('Depth', 'CDF')
    cdf.set_data(true_cascade, 'True')
    cdf.set_data(false_cascade, 'False')
    cdf.save_image('Image/depth_cdf.png')
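CDFPlot is likewise only used, never defined, in these excerpts. Below is a minimal sketch under the same assumption: a thin matplotlib wrapper mirroring the calls in this example (set_legends, set_xlim, set_label, set_data, save_image), with the legend deferred until save_image because set_legends is called before any data is added. The real class may differ.

import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt


class CDFPlot:
    """Minimal sketch of the empirical-CDF wrapper used in Example #5 (assumed)."""

    def __init__(self):
        self.fig, self.ax = plt.subplots()
        self._legend = None

    def set_legends(self, labels, title):
        # remembered here and applied at save time, since the examples call this first
        self._legend = (labels, title)

    def set_xlim(self, low, high):
        self.ax.set_xlim(low, high)

    def set_label(self, xlabel, ylabel):
        self.ax.set_xlabel(xlabel)
        self.ax.set_ylabel(ylabel)

    def set_data(self, values, label):
        # empirical CDF: sorted sample on x, cumulative fraction on y
        x = np.sort(np.asarray(values, dtype=float))
        y = np.arange(1, len(x) + 1) / float(len(x))
        self.ax.step(x, y, where='post', label=label)

    def save_image(self, path):
        if self._legend is not None:
            labels, title = self._legend
            self.ax.legend(labels, title=title or None)
        self.fig.savefig(path, bbox_inches='tight')
        plt.close(self.fig)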
Example #6
    category_count = []
    for c in category_all:
        count_list = []
        for i in range(2011, 2018):
            count = category_count_by_year(i, c)
            count_list.append(count)
        category_count.append(count_list)
    #print(category_count)
    df = pd.DataFrame(category_count, index = category_all, columns = range(2011, 2018))
    #print(df)
    
    LinePlt = LinePlot()
    LinePlt.set_label('year', 'number of articles')
    LinePlt.set_plot_data(category_count, 'category count')
    LinePlt.set_xticks(range(2011, 2018))
    LinePlt.set_legends(category_all)
    LinePlt.save_image('./image/category_count_year.png')

    print("top key words by category")
    writer = pd.ExcelWriter('./trending_words/trending_keywords_category.xlsx', engine='xlsxwriter')
    dataframe_list = []
    category_data = []
    for item in categories:    
        words = frequency(titles_category(item))
        category_data.append(tuple_to_string(words))
    df = pd.DataFrame(category_data, index = categories, columns = range(1,11))
    dataframe_list.append(df)
        #df.to_csv('./trending_words/%s.csv'%i, encoding='utf-8')

    for i, item in enumerate(dataframe_list):
        item.to_excel(writer, sheet_name=years[i])
    writer.save()

def edge_homogeneity():
    files = os.listdir(dir_name)
    
    retweet_cache = {}
    homogeneity = []
    for ccc, postid in enumerate(files):
        #users_polarity[postid] = {}
        with open(dir_name  + '%s'%postid, 'r') as f:
            tweets = json.load(f)
            retweet_cache[postid] = tweets

        for tweet in tweets.values():
            p_score = get_polarity(tweet['user'])

            #calculate edge homogeneity
            if tweet['depth'] != 1:
                #compare with parents if parent is not root node 
                p_score2 = get_polarity(tweet['parent'])

                if p_score == -999 or p_score2 == -999:
                    continue
                e = p_score * p_score2

                #print(p_score, p_score2, round(e, 1))
                homogeneity.append(round(e, 1))
        

    #    if ccc == 10:
    #        break

    #compare with echo chamber node's edge homogeneity
    echo_chamber_users = {}
    e_homogeneity = []
    ne_homogeneity = []
    with open('Data/echo_chamber2.json') as f:
        echo_chamber = json.load(f)

    for key in echo_chamber:
        users = echo_chamber[key]

        if len(users) < 1:
            continue

        for postid in key.split('_'):
            echo_chamber_users[postid] = echo_chamber_users.get(postid, {})
            for user in users:
                echo_chamber_users[postid][user] = 1 
 
    for postid in echo_chamber_users.keys():
        tweets = retweet_cache[postid]

        for tweet in tweets.values():
            #echo chamber user's edge homogeneity
            if tweet['depth'] != 1:
                p_score = get_polarity(tweet['user'])
                p_score2 = get_polarity(tweet['parent'])

                if p_score == -999 or p_score2 == -999:
                    continue
            
                # round to one decimal so the per-tick counts below can match x_ticks
                e = round(p_score * p_score2, 1)

                if tweet['user'] in echo_chamber_users[postid]:
                    e_homogeneity.append(e)
                else:
                    ne_homogeneity.append(e)


    draw_cdf_plot([e_homogeneity, ne_homogeneity], 'Homogeneity', ['Echo Chambers', 'Non-Echo Chambers'], 'User type', 'homogeneity')

    with open('Data/homogeneity.json', 'w') as f:
        json.dump({'e':e_homogeneity, 'ne' : ne_homogeneity}, f)

    x_ticks = np.arange(-1,1.1, 0.1)
    x_ticks = np.around(x_ticks, decimals=1)
    e_count = []
    ne_count = []
    for x in x_ticks:
        e_count.append(e_homogeneity.count(x))
        ne_count.append(ne_homogeneity.count(x))
    line = LinePlot()
    line.set_ylog()
    line.set_label('Homogeneity', 'Number of Edges')
    line.set_plot_data(e_count, x_ticks)
    line.set_plot_data(ne_count, x_ticks)
    line.set_legends(['Echo Chambers', 'Non-Echo Chambers'])
    line.set_xticks(x_ticks)
    line.save_image('Image/%s/homogeneity_line.png'%foldername)
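The core of edge_homogeneity() above is multiplying the polarity scores of a retweet's user and its parent, skipping the root edge and any endpoint whose score is the -999 missing-value sentinel. The function below is a self-contained sketch of just that step; the dict-based polarity lookup is a hypothetical stand-in for the repository's get_polarity(), and all names are illustrative.

from typing import Dict, List

MISSING = -999  # sentinel used above when no polarity score is available


def edge_homogeneity_scores(tweets: Dict[str, dict],
                            polarity: Dict[str, float]) -> List[float]:
    """Product of user and parent polarity for every non-root edge in one cascade.

    `tweets` mirrors the per-post JSON used above: each value carries
    'user', 'parent' and 'depth'. `polarity` stands in for get_polarity().
    """
    scores = []
    for tweet in tweets.values():
        if tweet['depth'] == 1:
            continue  # root edges have no parent user to compare against
        p_user = polarity.get(tweet['user'], MISSING)
        p_parent = polarity.get(tweet['parent'], MISSING)
        if p_user == MISSING or p_parent == MISSING:
            continue  # skip edges with a missing polarity score
        # rounded to one decimal, matching the 0.1-wide bins counted above
        scores.append(round(p_user * p_parent, 1))
    return scores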