示例#1
0
def main():
    """Plot the average within-day rank of each super-community (fig4cd).

    Rows returned by ``read_plot_data`` are ranked inside every
    ``centroid_day`` by ``h_index`` and by ``num_retweets`` (rank 1 is the
    largest value; ties are broken by order of appearance).  The ranks are
    averaged per day and super-community and drawn as two side-by-side line
    plots, which are handed to ``save_fig`` under the name 'fig4cd'.
    """
    df = read_plot_data(overwrite=False)
    df = df.reset_index()

    rank_cols = ['h_index', 'num_retweets']

    # Rank within each day in a single grouped call.  Replacing the columns
    # wholesale (instead of writing float ranks into them one group at a time
    # through ``.loc``) avoids pandas' incompatible-dtype setitem deprecation
    # when the raw columns hold integers.
    df[rank_cols] = df.groupby('centroid_day')[rank_cols].rank(
        method='first', ascending=False)

    # Average only the two rank columns: they are the only values plotted,
    # and other (possibly non-numeric) columns must not be able to break
    # the aggregation.
    df = (df.groupby(['centroid_day', 'super_community'])[rank_cols]
          .mean()
          .reset_index())

    palette = [
        '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b',
        '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'
    ]
    color_dict = {
        'International expert': palette[4],
        'National expert': palette[1],
        'Political': palette[2],
        'Other': '.5'
    }

    fig, axes = plt.subplots(1, 2, figsize=(3.3, 1.4))

    lw = .8
    ms = 4
    # Left panel: h-index rank, right panel: retweet rank.
    for sc, grp in df.groupby('super_community'):
        for ax, col in zip(axes, rank_cols):
            ax.plot(grp.centroid_day.values,
                    grp[col],
                    c=color_dict[sc],
                    label=sc,
                    ms=ms,
                    lw=lw,
                    marker='.')

    # Limits are passed as (bottom, top) = (_max, _min), i.e. the y-axis is
    # inverted so that smaller rank numbers are drawn higher up.
    _min = 600
    _max = 3200
    for ax in axes:
        ax.set_ylim((_max, _min))

    axes[0].set_ylabel('Avg. rank h-index $r_h$')
    axes[1].set_ylabel('Avg. rank retweets $r_{rt}$')

    for ax in axes:
        ax.grid()
        ax.xaxis.set_major_locator(mdates.MonthLocator())
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
        ax.tick_params(axis='x', direction='out', which='minor', size=2)
        ax.tick_params(axis='x', direction='out', which='major', size=2)
        ax.tick_params(axis='y', which='major', direction='out', size=2)
        # Only the y-axis locator understands ``nbins``; the x-axis uses the
        # MonthLocator set above, which would merely emit a warning.
        ax.locator_params(axis='y', nbins=4)

    sns.despine()
    fig.tight_layout()

    save_fig(fig, 'fig4cd', 1, dpi=600, plot_formats=['png', 'pdf', 'svg'])
def fig4a(do_log=False):
    """Draw the joint scatter of h-index rank vs. retweet rank (fig4a).

    Every row of the frame returned by ``read_data`` becomes one point,
    coloured by its ``super_community``.  Both axes are inverted (limits are
    given high-to-low) and a dashed diagonal is added as reference.  The
    result is passed to ``save_fig`` under the name 'fig4a'.

    Args:
        do_log: When true, the joint axes use log scales and the axis limits
            end at 1 instead of -30.  Defaults to ``False``, which reproduces
            the previously hard-coded linear behaviour.
    """
    palette = [
        '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b',
        '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'
    ]
    color_dict = {
        'International expert': palette[4],
        'National expert': palette[1],
        'Political': palette[2],
        'Other': '.5'
    }
    df = read_data(overwrite=False)

    g = sns.jointplot(data=df,
                      kind='scatter',
                      x='h_index_rank',
                      y='num_retweets_rank',
                      alpha=.8,
                      s=4,
                      ec=None,
                      hue='super_community',
                      palette=color_dict.values(),
                      hue_order=color_dict.keys(),
                      height=2,
                      marginal_ticks=False,
                      space=.1)
    joint = g.ax_joint

    # Axis limits: a little padding beyond the number of rows, listed
    # high-to-low so that the axes run in reverse.
    num_ranks = len(df) + 50
    low_end = 1 if do_log else -30
    joint.set_xlim((num_ranks, low_end))
    joint.set_ylim((num_ranks, low_end))

    joint.grid()
    joint.get_legend().remove()
    joint.set_xlabel(r'Rank $h$-index $r_h$')
    joint.set_ylabel('Rank retweets $r_{rt}$')

    # Hide the tick marks on the marginal axes.
    g.ax_marg_x.tick_params(axis='x', direction='out', which='major', size=0)
    g.ax_marg_y.tick_params(axis='y', direction='out', which='major', size=0)

    # Give the marginal plots 10% head-room above their current upper limit.
    g.ax_marg_y.set_xlim((None, g.ax_marg_y.get_xlim()[1] * 1.1))
    g.ax_marg_x.set_ylim((None, g.ax_marg_x.get_ylim()[1] * 1.1))

    # log scale
    if do_log:
        joint.set_xscale('log')
        joint.set_yscale('log')

    # diagonal line
    joint.plot([0, num_ranks], [0, num_ranks],
               color='0.15',
               ls='dashed',
               lw=.5)

    # num ticks
    num_ticks = 5
    joint.xaxis.set_major_locator(plt.MaxNLocator(num_ticks))
    joint.yaxis.set_major_locator(plt.MaxNLocator(num_ticks))

    save_fig(g, 'fig4a', 1, dpi=600, plot_formats=['png', 'pdf', 'svg'])
def main():
    """Cluster communities by category profile and draw the fig2 clustermap.

    Steps, in order:

    1. Load the category table and the geo-entropy table, transpose the
       category table so that communities are rows, normalise every row to
       sum to one and attach the ``Internationality`` column.
    2. Keep the columns listed in ``usecols`` and write them to
       ``df_cat.csv``.
    3. Diagnostics: a heatmap of the thresholded column z-scores
       (``../plots/Zmatrix.png``), printed "exceed" messages, and
       scree/PCA plots for 'average' and 'ward' linkage
       (``../plots/linkage.png``) together with printed cluster counts.
    4. Draw the final seaborn clustermap with super-community row colours
       and pass it to ``save_fig`` as 'fig2'.
    """
    df = get_category_data(without_other=False)
    df_entropy = get_geo_entropy()

    # Rows become communities; the row sum used for normalisation still
    # includes every category column present at this point.
    df = df.T
    df = df.divide(df.sum(axis=1), axis=0)
    df.columns = df.columns.tolist()
    df['Internationality'] = df_entropy['Internationality']

    # 'Other' is not part of usecols, so this selection also drops it.
    usecols = [
        'Political Supporter', 'Public Services', 'Politics & Government',
        'Media', 'Healthcare', 'Science', 'Internationality'
    ]
    df = df[usecols]

    width = 2.2
    height = 3.7
    cmap = 'RdBu_r'

    palette = [c['color'] for c in matplotlib.rcParams['axes.prop_cycle']]
    # NOTE(review): sibling figures key this colour by 'National expert';
    # here the key must match the values of community_to_types -- confirm.
    colors = {
        'International expert': palette[4],
        'National elites': palette[1],
        'Political': palette[2],
        'Other': '.5'
    }
    super_communities = [community_to_types[com] for com in df.index]
    super_community_colors = [colors[sc] for sc in super_communities]

    df.to_csv('df_cat.csv')

    # ------------------------------------------------------------------
    # Diagnostics on the thresholded z-score matrix
    # ------------------------------------------------------------------
    # Column-wise z-scores; everything below 0.5 is zeroed so that only
    # clear over-representation remains.
    Z = scipy.stats.zscore(df.values, axis=0)
    Z[Z < .5] = 0
    plt.figure(dpi=300)
    # Label every row of Z instead of assuming there are exactly 15.
    sns.heatmap(Z,
                annot=True,
                xticklabels=usecols,
                yticklabels=alphab[:len(Z)])
    plt.savefig('../plots/Zmatrix.png')
    print(Z)
    for i, com in enumerate(list(df.index)):
        for j, cat in enumerate(usecols):
            if Z[i, j] > 0:
                print('Com {} exceed of {}'.format(com, cat))

    # NOTE(review): the scatter plots below use the LAST two PCA components
    # (columns -2 and -1), not the leading ones -- confirm this is intended.
    Xpca = PCA().fit_transform(Z)

    clr = [
        '#2200CC', '#D9007E', '#FF6600', '#FFCC00', '#ACE600', '#0099CC',
        '#8900CC', '#FF0000', '#FF9900', '#FFFF00', '#00CC01', '#0055CC',
        'red'
    ]

    fig_link, axes23 = plt.subplots(2, 3, figsize=(6, 4))
    for method, axes in zip(['average', 'ward'], axes23):
        z = hierarchy.linkage(Z, method=method)
        # Merge distances, largest first.
        merge_dist = z[::-1, 2]

        # Plotting
        axes[0].plot(range(1, len(z) + 1), merge_dist, label='distance')
        knee = np.diff(merge_dist, 2)
        axes[0].plot(range(2, len(z)), knee, label='2nd deriv of distance')
        axes[0].legend(fontsize=4)

        # Best and second-best knee candidates: take the arg-max, blank it
        # out, then take the arg-max again.
        num_clust1 = knee.argmax() + 2
        knee[knee.argmax()] = 0
        num_clust2 = knee.argmax() + 2
        print(num_clust1)
        print(num_clust2)

        axes[0].text(num_clust1, merge_dist[num_clust1 - 1],
                     'possible\n<- knee point')

        part1 = hierarchy.fcluster(z, num_clust1, 'maxclust')
        part2 = hierarchy.fcluster(z, num_clust2, 'maxclust')

        for part, ax in zip([part1, part2], axes[1:]):
            for cluster in set(part):
                # Wrap around the colour list so that a cluster id equal to
                # or larger than len(clr) cannot raise an IndexError.
                ax.scatter(Xpca[part == cluster, -2],
                           Xpca[part == cluster, -1],
                           color=clr[cluster % len(clr)])
            for i in range(Xpca.shape[0]):
                ax.annotate(alphab[i], (Xpca[i, -2], Xpca[i, -1]))
        m = '\n(method: {})'.format(method)
        plt.setp(axes[0],
                 title='Screeplot{}'.format(m),
                 xlabel='partition',
                 ylabel='{}\ncluster distance'.format(m))
        plt.setp(axes[1], title='{} Clusters'.format(num_clust1))
        plt.setp(axes[2], title='{} Clusters'.format(num_clust2))

    # Lay out and write the diagnostic figure once, after both rows of
    # panels have been filled.
    fig_link.tight_layout()
    fig_link.savefig('../plots/linkage.png')

    # Print the flat clusterings obtained with a few different cut settings.
    row_linkage = hierarchy.linkage(Z, method='average', metric='euclidean')
    cut_settings = [
        dict(t=1.1, depth=2),
        dict(t=1.2, depth=3),
        dict(t=3.1, criterion='distance'),
        dict(t=3.3, criterion='distance'),
    ]
    for settings in cut_settings:
        clus = hierarchy.fcluster(row_linkage, **settings)
        print('Nclus: ', len(set(clus)), clus)

    print(df.index)

    # ------------------------------------------------------------------
    # Final figure
    # ------------------------------------------------------------------
    g = sns.clustermap(
        df,
        method='ward',
        dendrogram_ratio=.3,
        colors_ratio=.04,
        cmap=cmap,
        center=0,
        lw=0.4,
        col_cluster=False,
        z_score=1,
        figsize=(width, height),
        cbar_pos=(1.1, .33, .02, .2),
        row_colors=super_community_colors)

    # Columns are not clustered, so the column dendrogram axes is removed.
    fig = plt.gcf()
    fig.delaxes(g.ax_col_dendrogram)

    g.ax_row_colors.get_xaxis().set_visible(False)

    # colorbar formatting
    g.ax_cbar.set_ylabel('Z-score', rotation=0, ha='left', va='center')
    g.ax_cbar.locator_params(nbins=5)
    g.ax_cbar.tick_params(length=2,
                          axis='both',
                          which='major',
                          direction='out')

    # create type legend
    type_legend_patches = [
        mpatches.Patch(color=c, label=l) for l, c in colors.items()
    ]
    labels = list(colors.keys())
    g.ax_heatmap.legend(labels=labels,
                        loc='center left',
                        bbox_to_anchor=(1.3, .85),
                        handles=type_legend_patches,
                        frameon=False,
                        title='Super-community',
                        handlelength=.6,
                        handleheight=1)

    # move axis labels
    g.ax_heatmap.set_xticklabels(g.ax_heatmap.get_xticklabels(),
                                 rotation=75,
                                 ha='right')
    g.ax_heatmap.set_yticklabels(g.ax_heatmap.get_yticklabels(), rotation=0)
    g.ax_row_colors.set_xticklabels(g.ax_row_colors.get_xticklabels(),
                                    rotation=75,
                                    ha='right')
    # Shift the rotated x tick labels 0.05 in dpi-scaled units to the right.
    offset = matplotlib.transforms.ScaledTranslation(.05, 0,
                                                     fig.dpi_scale_trans)
    for ax in [g.ax_heatmap, g.ax_row_colors]:
        for label in ax.xaxis.get_majorticklabels():
            label.set_transform(label.get_transform() + offset)

    # other cosmetics
    for ax in [g.ax_heatmap, g.ax_row_colors]:
        ax.tick_params(axis='both', which='both', bottom=False, right=False)
    g.ax_heatmap.set_ylabel('Community')

    save_fig(fig, 'fig2', 1, plot_formats=['png', 'pdf'], dpi=600)
def fig4b():
    """Plot mean h-index rank vs. mean retweet rank per super-community.

    For every ``super_community`` in the frame returned by ``read_data`` one
    point is drawn at the group means of ``h_index_rank`` and
    ``num_retweets_rank``, with error bars derived from ``bootstrap_ci``.
    Both axes are inverted, a dashed diagonal and two annotated arrows are
    added, and the figure is passed to ``save_fig`` as 'fig4b'.
    """
    palette = [
        '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b',
        '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'
    ]
    color_dict = {
        'International expert': palette[4],
        'National expert': palette[1],
        'Political': palette[2],
        'Other': '.5'
    }
    df = read_data()
    fig, ax = plt.subplots(1, 1, figsize=(2, 2), sharex=False, sharey=False)

    rank_cols = ['h_index_rank', 'num_retweets_rank']
    for _t, grp in df.groupby('super_community'):
        # Average only the two rank columns: the group still carries the
        # string column 'super_community', and a frame-wide mean() raises
        # TypeError on non-numeric data in pandas >= 2.0.
        means = grp[rank_cols].mean()
        ci = bootstrap_ci(grp, list(rank_cols))
        # errorbar expects offsets from the point as [[lower], [upper]];
        # build them in a fresh dict rather than overwriting ``ci`` while
        # iterating over it.
        err = {
            col: [[abs(means[col] - ci[col][0])],
                  [abs(means[col] - ci[col][1])]]
            for col in rank_cols
        }
        ax.errorbar(means['h_index_rank'],
                    means['num_retweets_rank'],
                    xerr=err['h_index_rank'],
                    yerr=err['num_retweets_rank'],
                    label=_t,
                    c=color_dict[_t],
                    ms=5,
                    marker='.',
                    capsize=.8,
                    elinewidth=.5,
                    markeredgewidth=.5)

    # Limits are given high-to-low, so both axes run in reverse.
    num_ranks = len(df)
    ax.set_xlim((num_ranks, 0))
    ax.set_ylim((num_ranks, 0))
    ax.grid(True)

    # num ticks
    num_ticks = 4
    ax.xaxis.set_major_locator(plt.MaxNLocator(num_ticks))
    ax.yaxis.set_major_locator(plt.MaxNLocator(num_ticks))

    # annotate: two arrows starting at the centre of the plot, pointing in
    # opposite directions perpendicular to the diagonal.
    len_arr = 500
    center = num_ranks / 2
    arrow_props = dict(fc='.15',
                       shrink=.1,
                       width=.15,
                       headwidth=2,
                       headlength=2)
    arrow_tips = [
        (center - len_arr, center + len_arr),
        (center + len_arr, center - len_arr),
    ]
    for tip in arrow_tips:
        ax.annotate('',
                    xy=tip,
                    xytext=(center, center),
                    ha='left',
                    va='top',
                    arrowprops=arrow_props)
    ax.text(center + len_arr - 200,
            center - len_arr,
            'Over-ranked\nby retweets\n(high virality)',
            ha='right',
            va='bottom',
            fontsize=6)
    ax.text(center - len_arr,
            center + len_arr,
            'Under-ranked\nby retweets\n(low virality)',
            ha='left',
            va='top',
            fontsize=6)

    # axis labels
    ax.set_xlabel(r'Avg. rank $h$-index $r_h$')
    ax.set_ylabel('Avg. rank retweets $r_{rt}$')

    sns.despine()
    ax.set_aspect('equal')

    # diagonal line
    ax.plot([0, num_ranks], [0, num_ranks],
            color='0.15',
            ls='dashed',
            lw=.5,
            zorder=0)

    save_fig(fig, 'fig4b', 2, dpi=600, plot_formats=['png', 'pdf', 'svg'])
def main():
    """Draw the four-panel time-series figure (fig3).

    Panels (2 x 2):

    * top-left: community sizes over time (``df_size``),
    * top-right: 'received' interactions divided by community size,
    * bottom-left: the 'inter_cluster' columns,
    * bottom-right: the 'intra_cluster' columns.

    Every panel shows one line per entry of ``col_order``; 'Total' is the
    row-wise sum of the other columns.  Two dashed vertical lines labelled
    'a' and 'b' mark index positions 2 and 9.  The figure is passed to
    ``save_fig`` as 'fig3'.
    """
    # read data
    df_int = get_interaction_data()
    df_size = get_community_size_data()
    # Restrict the sizes to the range covered by the interaction data.  The
    # explicit copy makes the 'Total' assignment below operate on an
    # independent frame rather than on a slice of the loaded data.
    df_size = df_size[df_int.index.min():df_int.index.max()].copy()
    df_size['Total'] = df_size.sum(axis=1)

    # Split the interaction table by column-name prefix; the part after the
    # last underscore becomes the new column name.
    dfs = {}
    for _type in ['inter_cluster', 'intra_cluster', 'received']:
        df = df_int[df_int.columns[df_int.columns.str.startswith(_type)]].copy()
        df = df.rename(columns={col: col.split('_')[-1] for col in df.columns})
        df['Total'] = df.sum(axis=1)
        dfs[_type] = df

    col_order = ['Total', 'International expert', 'National expert', 'Political', 'Other']
    palette = [c['color'] for c in matplotlib.rcParams['axes.prop_cycle']]
    color_dict = {'International expert': palette[4], 'National expert': palette[1], 'Political': palette[2], 'Other': '.5', 'Total': '0.15'}
    ls_dict = {'International expert': '-', 'National expert': '-', 'Political': '-', 'Other': '-', 'Total': 'dotted'}
    m_dict = {'International expert': '.', 'National expert': '.', 'Political': '.', 'Other': '.', 'Total': None}
    z_order = {'International expert': 4, 'National expert': 3, 'Political': 2, 'Other': 1,  'Total': 0}

    # plot
    fig, all_axes = plt.subplots(2, 2, figsize=(4.5, 3.5), sharex=False, sharey=False)

    # specs
    lw = 1
    ms = 5

    def plot_columns(ax, frame):
        # One line per entry of col_order, styled via the dicts above.
        for col in col_order:
            ax.plot(frame.index.values,
                    frame[col].values,
                    label=col,
                    color=color_dict[col],
                    marker=m_dict[col],
                    ms=ms,
                    lw=lw,
                    ls=ls_dict[col],
                    zorder=z_order[col])

    # panels A
    ax = all_axes[0][0]
    plot_columns(ax, df_size)
    ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
    ax.set_ylabel('$N$')
    ax.set_title('Number of users', fontsize=7)
    ax.set_ylim((0, 0.6*10**7))
    ax.grid()

    # panels B
    ax = all_axes[0][1]
    df = dfs['received']
    df /= df_size # normalize by size of type
    plot_columns(ax, df)
    ax.set_ylabel('$A_u$')
    ax.set_title('Avg. attention per user', fontsize=7)
    ax.set_ylim((0, 8))
    ax.grid()

    # panels C
    ax = all_axes[1][0]
    plot_columns(ax, dfs['inter_cluster'])
    ax.set_title('External attention component', fontsize=7)
    ax.set_ylabel(r'$a^{ext}$')
    ax.set_xlabel('')
    ax.grid()
    ax.set_ylim((0, .35))

    # panels D
    df = dfs['intra_cluster']
    ax = all_axes[1][1]
    plot_columns(ax, df)
    ax.set_title('Internal attention component', fontsize=7)
    ax.set_ylabel(r'$a^{int}$')
    ax.set_xlabel('')
    ax.grid()
    ax.set_ylim((0, .95))

    # vertical lines: positions are taken from the 'intra_cluster' index and
    # labelled 'a', 'b' just below the top of each panel.
    for i, time_pos in enumerate([2, 9]):
        when = df.index[time_pos]
        for ax in all_axes.flat:
            ax.axvline(when, ls='dashed', lw=.5, c='.15', zorder=0)
            ax.annotate(chr(97 + i), (when, ax.get_ylim()[1]*.95), xytext=(2, 0), textcoords='offset points', ha='left', va='center')

    # tick style
    for ax in all_axes.flat:
        ax.xaxis.set_major_locator(mdates.MonthLocator())
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%b'))
        ax.tick_params(axis='x', direction='out', which='minor', size=2)
        ax.tick_params(axis='x', direction='out', which='major', size=2)
        ax.tick_params(axis='y', which='major', direction='out', size=2)
    sns.despine()
    fig.tight_layout()
    fig.subplots_adjust(hspace=.5)
    save_fig(fig, 'fig3', 1, dpi=600, plot_formats=['png', 'pdf'])