Пример #1
0
def plot_basics(data, data_inst, fig, units):
    '''
    Draw a three-panel column of power-law diagnostics on ``fig``.

    This is the main plotting function, adapted from Newman's powerlaw
    package.  The panels are placed in a 4-row x 1-column subplot grid at
    indices ``data_inst``, ``data_inst + 1`` and ``data_inst + 2``:

    1. scatter of the linearly binned PDF plus the ``plot_pdf`` curve of the
       strictly positive values;
    2. the same curve with two discrete power-law fits: one where ``Fit``
       chooses ``xmin`` and one with ``xmin=3``;
    3. the ``xmin=3`` fit: its data PDF with the power-law and exponential
       candidate distributions.

    Parameters
    ----------
    data : array supporting boolean-mask indexing (``data[data > 0]``).
    data_inst : int
        1-based subplot index of the first panel.
    fig : matplotlib figure the panels are added to.
    units : str
        x-axis label of the bottom panel.

    Side effects: changes global matplotlib rcParams (tick padding, font).
    '''
    import pylab
    from matplotlib import rc
    from powerlaw import plot_pdf, Fit, pdf
    from pylab import setp

    # Global styling; this affects every figure drawn afterwards as well.
    pylab.rcParams['xtick.major.pad'] = '8'
    pylab.rcParams['ytick.major.pad'] = '8'
    pylab.rcParams['font.sans-serif'] = 'Arial'
    rc('font', family='sans-serif')
    rc('font', size=10.0)
    rc('text', usetex=False)

    n_data = 1
    n_graphs = 4
    positive = data[data > 0]

    # --- Panel 1: raw (linear-bin) PDF against the plot_pdf curve ---
    ax1 = fig.add_subplot(n_graphs, n_data, data_inst)
    x, y = pdf(data, linear_bins=True)
    # The mask is built from y and only fits x once x's last entry is dropped;
    # bins with zero density are discarded.
    nonzero = y > 0
    y = y[nonzero]
    x = x[:-1][nonzero]
    ax1.scatter(x, y, color='r', s=.5, label='data')
    plot_pdf(positive, ax=ax1, color='b', linewidth=2, label='PDF')
    setp(ax1.get_xticklabels(), visible=False)
    # Legends are attached to the axes directly: the original used
    # ``plt.legend(loc='bestloc')``, but ``plt`` is not imported here and
    # 'bestloc' is not a valid location name ('best' is).
    ax1.legend(loc='best')

    # --- Panel 2: power-law fits with automatic and fixed xmin ---
    ax2 = fig.add_subplot(n_graphs, n_data, n_data + data_inst, sharex=ax1)
    plot_pdf(positive, ax=ax2, color='b', linewidth=2, label='PDF')
    fit = Fit(data, discrete=True)
    fit.power_law.plot_pdf(ax=ax2, linestyle=':', color='g', label='w/o xmin')

    ax2.set_xlim(ax1.get_xlim())
    fit = Fit(data, discrete=True, xmin=3)
    fit.power_law.plot_pdf(ax=ax2, linestyle='--', color='g', label='w xmin')
    setp(ax2.get_xticklabels(), visible=False)
    ax2.legend(loc='best')

    # --- Panel 3: candidate distributions for the xmin=3 fit ---
    ax3 = fig.add_subplot(n_graphs, n_data, n_data * 2 + data_inst)
    fit.power_law.plot_pdf(ax=ax3, linestyle='--', color='g', label='powerlaw')
    fit.exponential.plot_pdf(ax=ax3, linestyle='--', color='r', label='exp')
    fit.plot_pdf(ax=ax3, color='b', linewidth=2)

    # Reuse the limits of the panels above so the three plots are comparable.
    ax3.set_ylim(ax2.get_ylim())
    ax3.set_xlim(ax1.get_xlim())
    ax3.legend(loc='best')
    ax3.set_xlabel(units)
Пример #2
0
	def draw_plots(self):
		"""Plot the CCDF of the data returned by ``self.on_data()`` and show it.

		The fit is created with ``xmin`` set to the smallest data value, and the
		curve is drawn on a single subplot of a new 4x4-inch figure.
		"""
		from matplotlib import pyplot as plt
		from powerlaw import Fit

		figure = plt.figure(figsize=(4, 4))
		axes = figure.add_subplot(111)

		samples = self.on_data()
		lowest = min(samples)
		Fit(samples, xmin=lowest).plot_ccdf(ax=axes)

		plt.show()
Пример #3
0
    def draw_plots(self):
        """Display the complementary CDF of ``self.on_data()``.

        A new 4x4-inch figure with one subplot is created; the ``powerlaw``
        fit uses the minimum of the data as ``xmin``.
        """
        from matplotlib import pyplot as plt
        from powerlaw import Fit

        canvas = plt.figure(figsize=(4, 4))
        panel = canvas.add_subplot(111)

        values = self.on_data()
        empirical_fit = Fit(values, xmin=min(values))
        empirical_fit.plot_ccdf(ax=panel)

        plt.show()
Пример #4
0
def plot_powerlaw_combined(data, data_inst, fig, units):
	"""
	Draw two stacked panels comparing power-law fits of ``data`` on ``fig``.

	The upper panel overlays the data PDF with a discrete power-law fit at
	``xmin=1`` (dotted) and one where ``Fit`` picks ``xmin`` (dashed).  The
	lower panel shows the second fit's PDF together with its power-law
	(green) and exponential (red) curves, reusing the upper panel's limits.

	Relies on the module-level ``n_graphs`` and ``n_data`` grid dimensions.
	Panel letters and the y-label are only added when ``data_inst`` is 1.
	``units`` becomes the x-axis label of the lower panel.
	"""
	from powerlaw import plot_pdf, Fit, pdf
	from pylab import setp

	label_xy = (-.4, .95)
	first_column = data_inst == 1

	# --- Upper panel ---
	upper = fig.add_subplot(n_graphs, n_data, data_inst)
	plot_pdf(data, ax=upper, color='b', linewidth=2)

	fixed_xmin_fit = Fit(data, xmin=1, discrete=True)
	fixed_xmin_fit.power_law.plot_pdf(ax=upper, linestyle=':', color='g')
	# The returned values are not used anywhere below.
	fixed_xmin_fit.power_law.pdf()

	fit = Fit(data, discrete=True)
	fit.power_law.plot_pdf(ax=upper, linestyle='--', color='g')

	setp(upper.get_xticklabels(), visible=False)

	if first_column:
		upper.annotate("A", label_xy, xycoords="axes fraction", fontsize=14)
		upper.set_ylabel(r"$p(X)$")

	# --- Lower panel ---
	lower = fig.add_subplot(n_graphs, n_data, n_data + data_inst)
	fit.power_law.plot_pdf(ax=lower, linestyle='--', color='g')
	fit.exponential.plot_pdf(ax=lower, linestyle='--', color='r')
	fit.plot_pdf(ax=lower, color='b', linewidth=2)

	lower.set_ylim(upper.get_ylim())
	# Keep only every second y tick.
	lower.set_yticks(lower.get_yticks()[::2])
	lower.set_xlim(upper.get_xlim())

	if first_column:
		lower.annotate("B", label_xy, xycoords="axes fraction", fontsize=14)

	lower.set_xlabel(units)
Пример #5
0
def plot_basics(data, data_inst, fig, units):
    '''
    Draw a three-panel column of power-law diagnostics on ``fig``.

    This is the main plotting function, adapted from Newman's powerlaw
    package.  Panels go into a 4-row x 1-column subplot grid at indices
    ``data_inst``, ``data_inst + 1`` and ``data_inst + 2``:

    1. linearly binned PDF (scatter) and the ``plot_pdf`` curve of the
       strictly positive values;
    2. that curve again with two discrete power-law fits (``xmin`` chosen by
       ``Fit`` vs. ``xmin=3``);
    3. the ``xmin=3`` fit with its power-law and exponential candidates.

    Parameters
    ----------
    data : array supporting boolean-mask indexing (``data[data > 0]``).
    data_inst : int
        1-based subplot index of the first panel.
    fig : matplotlib figure the panels are added to.
    units : str
        x-axis label of the bottom panel.

    Side effects: changes global matplotlib rcParams (tick padding, font).
    '''
    import pylab
    from matplotlib import rc
    from powerlaw import plot_pdf, Fit, pdf
    from pylab import setp

    # Global styling; later figures are affected too.
    pylab.rcParams['xtick.major.pad'] = '8'
    pylab.rcParams['ytick.major.pad'] = '8'
    pylab.rcParams['font.sans-serif'] = 'Arial'
    rc('font', family='sans-serif')
    rc('font', size=10.0)
    rc('text', usetex=False)

    n_data = 1
    n_graphs = 4
    positive = data[data > 0]

    # --- Panel 1 ---
    ax1 = fig.add_subplot(n_graphs, n_data, data_inst)
    x, y = pdf(data, linear_bins=True)
    # Drop x's last entry so the mask derived from y can index it, and keep
    # only the bins whose density is non-zero.
    keep = y > 0
    y = y[keep]
    x = x[:-1][keep]
    ax1.scatter(x, y, color='r', s=.5, label='data')
    plot_pdf(positive, ax=ax1, color='b', linewidth=2, label='PDF')
    setp(ax1.get_xticklabels(), visible=False)
    # Use the axes' own legend(): ``plt`` is not imported in this function and
    # 'bestloc' (used previously) is not a valid location; 'best' is.
    ax1.legend(loc='best')

    # --- Panel 2 ---
    ax2 = fig.add_subplot(n_graphs, n_data, n_data + data_inst, sharex=ax1)
    plot_pdf(positive, ax=ax2, color='b', linewidth=2, label='PDF')
    fit = Fit(data, discrete=True)
    fit.power_law.plot_pdf(ax=ax2, linestyle=':', color='g', label='w/o xmin')

    ax2.set_xlim(ax1.get_xlim())
    fit = Fit(data, discrete=True, xmin=3)
    fit.power_law.plot_pdf(ax=ax2, linestyle='--', color='g', label='w xmin')
    setp(ax2.get_xticklabels(), visible=False)
    ax2.legend(loc='best')

    # --- Panel 3 ---
    ax3 = fig.add_subplot(n_graphs, n_data, n_data * 2 + data_inst)
    fit.power_law.plot_pdf(ax=ax3, linestyle='--', color='g', label='powerlaw')
    fit.exponential.plot_pdf(ax=ax3, linestyle='--', color='r', label='exp')
    fit.plot_pdf(ax=ax3, color='b', linewidth=2)

    # Match the limits of the panels above for easy comparison.
    ax3.set_ylim(ax2.get_ylim())
    ax3.set_xlim(ax1.get_xlim())
    ax3.legend(loc='best')
    ax3.set_xlabel(units)
Пример #6
0
def distribution_compare_dict(fit: powerlaw.Fit) -> Dict[str, float]:
    """
    Collect pairwise distribution comparisons of a fit into a flat dict.

    For every compared pair the values returned by
    ``fit.distribution_compare(..., normalized_ratio=True)`` are stored under
    the keys ``"<first> vs. <second> R"`` and ``"<first> vs. <second> p"``,
    where the names are the ``value`` of the respective ``Dist`` members.
    """
    compared_pairs = (
        (Dist.POWERLAW, Dist.LOGNORMAL),
        (Dist.POWERLAW, Dist.EXPONENTIAL),
        (Dist.LOGNORMAL, Dist.EXPONENTIAL),
        (Dist.POWERLAW, Dist.TRUNCATED_POWERLAW),
    )
    results = {}
    for left_enum, right_enum in compared_pairs:
        left = left_enum.value
        right = right_enum.value
        ratio, p_value = fit.distribution_compare(
            left, right, normalized_ratio=True
        )
        results[f"{left} vs. {right} R"] = ratio
        results[f"{left} vs. {right} p"] = p_value
    return results
def plot_basics(data, data_inst, fig, units):
    """
    Draw the three-panel (A/B/C) power-law figure column for ``data``.

    Panels are added to ``fig`` at subplot indices ``data_inst``,
    ``n_data + data_inst`` and ``2 * n_data + data_inst``:

    A. linearly binned PDF (scatter), the ``plot_pdf`` curve of the strictly
       positive values, and a small inset histogram of the data;
    B. the data PDF with discrete power-law fits at ``xmin=1`` (dotted) and
       at the ``xmin`` chosen by ``Fit`` (dashed);
    C. the automatic-``xmin`` fit with its power-law (green) and exponential
       (red) candidates.

    Relies on the module-level names ``n_graphs``, ``n_data`` and
    ``panel_label_font``.  Panel letters and the y-label are drawn only when
    ``data_inst`` is 1.  ``units`` is the x-axis label of panel C.
    """
    from powerlaw import plot_pdf, Fit, pdf
    from pylab import setp
    # ``mpl_toolkits.axes_grid`` was deprecated and later removed from
    # matplotlib; ``axes_grid1`` provides the same ``inset_axes``.
    from mpl_toolkits.axes_grid1.inset_locator import inset_axes

    annotate_coord = (-.4, .95)
    label_panels = data_inst == 1

    # --- Panel A ---
    ax1 = fig.add_subplot(n_graphs, n_data, data_inst)
    x, y = pdf(data, linear_bins=True)
    # Drop x's last entry so the mask derived from y can index it; bins with
    # zero density are discarded.
    ind = y > 0
    y = y[ind]
    x = x[:-1][ind]
    ax1.scatter(x, y, color='r', s=.5)
    plot_pdf(data[data > 0], ax=ax1, color='b', linewidth=2)
    setp(ax1.get_xticklabels(), visible=False)

    if label_panels:
        ax1.annotate("A", annotate_coord, xycoords="axes fraction",
                     fontproperties=panel_label_font)

    ax1in = inset_axes(ax1, width="30%", height="30%", loc=3)
    # ``density=True`` replaces the removed ``normed=True`` keyword.
    ax1in.hist(data, density=True, color='b')
    ax1in.set_xticks([])
    ax1in.set_yticks([])

    # --- Panel B ---
    ax2 = fig.add_subplot(n_graphs, n_data, n_data + data_inst, sharex=ax1)
    plot_pdf(data, ax=ax2, color='b', linewidth=2)
    fit = Fit(data, xmin=1, discrete=True)
    fit.power_law.plot_pdf(ax=ax2, linestyle=':', color='g')

    ax2.set_xlim(ax1.get_xlim())

    fit = Fit(data, discrete=True)
    fit.power_law.plot_pdf(ax=ax2, linestyle='--', color='g')
    setp(ax2.get_xticklabels(), visible=False)

    if label_panels:
        ax2.annotate("B", annotate_coord, xycoords="axes fraction",
                     fontproperties=panel_label_font)
        ax2.set_ylabel(u"p(X)")

    # --- Panel C ---
    ax3 = fig.add_subplot(n_graphs, n_data, n_data * 2 + data_inst)
    fit.power_law.plot_pdf(ax=ax3, linestyle='--', color='g')
    fit.exponential.plot_pdf(ax=ax3, linestyle='--', color='r')
    fit.plot_pdf(ax=ax3, color='b', linewidth=2)

    # Reuse the limits of the panels above so the plots are comparable.
    ax3.set_ylim(ax2.get_ylim())
    ax3.set_xlim(ax1.get_xlim())

    if label_panels:
        ax3.annotate("C", annotate_coord, xycoords="axes fraction",
                     fontproperties=panel_label_font)

    ax3.set_xlabel(units)
Пример #8
0
def normalize_fit_to_area(
    fit: powerlaw.Fit, length_distribution: LengthDistribution
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Divide the ccdf of a powerlaw fit by the distribution's area value.

    The ccdf is taken from the original (unfitted) data, restricted to
    lengths strictly greater than ``fit.xmin`` and then divided by
    ``length_distribution.area_value``.

    Returns the retained lengths and their area-normalized ccdf values.
    """
    # Request the ccdf of the original data rather than of the fitted data.
    all_lengths, all_ccm = fit.ccdf(original_data=True)

    # Boolean mask of the lengths above the cut-off.
    above_cut_off = all_lengths > fit.xmin
    assert isinstance(above_cut_off, np.ndarray)
    count_above = sum(above_cut_off)
    assert count_above > 0

    kept_lengths = all_lengths[above_cut_off]
    kept_ccm = all_ccm[above_cut_off]

    area_value = length_distribution.area_value
    assert area_value > 0

    ccm_summary = pd.Series(kept_ccm).describe().to_dict()
    logging.info(
        "Normalizing ccm with area_value.",
        extra=dict(area_value=area_value, ccm_array_description=ccm_summary),
    )
    normed_ccm = kept_ccm / area_value

    filtered_count = len(all_lengths) - len(kept_lengths)
    logging.info(
        "Normalized fit ccm.",
        extra=dict(
            sum_are_over_cut_off=count_above,
            fit_xmin=fit.xmin,
            amount_filtered=filtered_count,
            length_distribution_area_value=area_value,
        ),
    )

    return kept_lengths, normed_ccm
Пример #9
0
 def clust_powlaw(self, G):
   """Return the exponent of a power law fitted to the degrees of ``G``.

   The sorted node degrees are handed to ``Fit`` and the fitted
   ``power_law.alpha`` (the "gamma" of the degree distribution) is
   returned.  No goodness-of-fit test is performed here.
   """
   # dict(...) accepts both the plain dict returned by NetworkX 1.x and the
   # (node, degree) view returned by NetworkX 2.x, which has no .values().
   degrees = sorted(dict(G.degree()).values())
   return Fit(degrees).power_law.alpha
Пример #10
0
def plplot(data, title, save=False, save_path=None):
    """
    Draw a three-panel power-law diagnostic figure and return the fit parameters.

    Panels (one row, three columns):

    A. linearly binned PDF (scatter), the ``plot_pdf`` curve of the strictly
       positive values, and a small inset histogram;
    B. the data PDF with two discrete power-law fits: ``xmin=1`` (dotted)
       and automatically chosen ``xmin`` with alpha limited to [-5, 10]
       (dashed); the fitted parameters are annotated on the panel;
    C. the PDF of the positive values with the power-law (green) and
       exponential (red) curves of the second fit.

    Parameters
    ----------
    data : sequence of numbers; converted with ``np.array``.
    title : str
        Figure title; it is also printed together with ``pdf(data)``.
    save : bool
        When true the figure is written to ``save_path``; otherwise it is
        shown with ``plt.show()``.
    save_path : path-like, required when ``save`` is true.

    Returns
    -------
    (params1, params2) : two ``(alpha, xmin, sigma)`` tuples, for the
        ``xmin=1`` fit and the automatic-``xmin`` fit respectively.

    Raises
    ------
    ValueError
        If ``save`` is true but no ``save_path`` was given.
    """
    if save and save_path is None:
        # Fail before doing any fitting instead of deep inside savefig().
        raise ValueError("save_path must be given when save=True")

    data = np.array(data)

    fig = plt.figure(figsize=(18, 6))
    fig.suptitle(title)

    # === A ===
    ax1 = fig.add_subplot(1, 3, 1)

    # Linearly binned PDF as a scatter plot.
    x, y = pdf(data, linear_bins=True)
    # Drop x's last entry so the mask derived from y can index it; bins with
    # zero density are discarded.
    ind = y > 0
    y = y[ind]
    x = x[:-1][ind]
    ax1.scatter(x, y, color='r', s=.5)

    # Log-log: draw the probability density curve.
    plot_pdf(data[data > 0], ax=ax1, color='b', linewidth=2)

    ax1.set_xlabel('A')

    # Draw the small inset histogram.  ``axes_grid1`` replaces the removed
    # ``mpl_toolkits.axes_grid`` and ``density`` the removed ``normed``.
    from mpl_toolkits.axes_grid1.inset_locator import inset_axes
    ax1in = inset_axes(ax1, width="30%", height="30%", loc=3)
    ax1in.hist(data, density=True, color='b')
    ax1in.set_xticks([])
    ax1in.set_yticks([])

    # === B ===
    annotation = ''
    ax2 = fig.add_subplot(1, 3, 2, sharey=ax1)

    # Log-log: draw the probability density curve.
    print(title)
    print(pdf(data))
    print()
    plot_pdf(data, ax=ax2, color='b', linewidth=2)

    # Fit a power law and plot it.
    fit = Fit(data, xmin=1, discrete=True,
              parameter_range={'alpha': [None, None]})
    fit.power_law.plot_pdf(ax=ax2, linestyle=':', color='g')
    params1 = (fit.power_law.alpha, fit.power_law.xmin, fit.power_law.sigma)

    # alpha is the fitted exponent;
    # xmin is the smallest x value considered (kept non-zero), fixed to 1 here;
    # sigma is the error estimate reported for alpha.
    annotation += '\':\' - alpha={:.2f}, xmin= {}, sigma={:.2f}'.format(*params1)

    fit = Fit(data, discrete=True, parameter_range={'alpha': [-5, 10]})
    # Unlike the first fit line on ax2, xmin is not specified here; the
    # automatically computed optimal value is used.
    fit.power_law.plot_pdf(ax=ax2, linestyle='--', color='g')
    params2 = (fit.power_law.alpha, fit.power_law.xmin, fit.power_law.sigma)
    annotation += '\n\'--\' - alpha={:.2f}, xmin= {}, sigma={:.2f}'.format(*params2)

    ax2.set_xlabel('B')
    ax2.set_ylabel(u"p(X)")
    ax2.set_xlim(ax1.get_xlim())
    annotate_coord = (0.05, 0.88)
    ax2.annotate(annotation, annotate_coord, xycoords="axes fraction")

    # === C ===
    ax3 = fig.add_subplot(1, 3, 3, sharey=ax1)
    plot_pdf(data[data > 0], ax=ax3, color='b', linewidth=2)
    fit.power_law.plot_pdf(ax=ax3, linestyle='--', color='g')
    fit.exponential.plot_pdf(ax=ax3, linestyle='--', color='r')

    ax3.set_ylim(ax2.get_ylim())
    ax3.set_xlim(ax1.get_xlim())

    ax3.set_xlabel('C')

    if save:
        plt.savefig(save_path)
    else:
        plt.show()

    return params1, params2
Пример #11
0
    print "sum :%g, mean :%g" % (np.sum(data), np.mean(data))
    return data


#--------------------------------------------------------------#
# Demo script: draw a sample with generate_power_law_dist(), fit a power law to
# it and compare the fit with the empirical distribution on log-log axes.
fig, ax = pl.subplots(1, figsize=(8, 10))

# Arguments forwarded to generate_power_law_dist() -- presumably sample size,
# exponent, value range and RNG seed; confirm against that function.
N = 5000
n = -2.6
xmin, xmax = 2.0, 10000.0
seed = 1234785

data = generate_power_law_dist(N, n, xmin, xmax, seed)

# Relative frequency of every distinct value in the sample.
counter = collections.Counter(data)
# Materialize the dict views: on Python 3 they are not sequences, so
# np.asarray() / loglog() would not see the individual entries.
pk = list(counter.values())
k = list(counter.keys())
pk = np.asarray(pk) / float(np.sum(pk))

fit = Fit(data)
fit.power_law.plot_pdf(ax=ax, linestyle=':', color='g')
# Single-argument print(...) produces the same output on Python 2 and 3.
print(fit.power_law.alpha)
print(fit.power_law.sigma)

ax.loglog(k, pk, '.')
# Target the axes explicitly instead of relying on the current axes.
plot_pdf(data, ax=ax, color='r')

pl.show()
def plot_basics(data, data_inst, fig, units):
    """
    Add the power-law figure panels for ``data`` to ``fig``.

    Three axes are created at subplot indices ``data_inst``,
    ``n_data + data_inst`` and ``2 * n_data + data_inst``.  The first two
    (panel A: raw PDF; panel B: power-law fits with ``xmin=1`` and with
    automatic ``xmin``) are created with ``visible=False``; only panel C,
    comparing the power-law and exponential fits, is created visible.

    Relies on the module-level names ``n_graphs``, ``n_data`` and
    ``panel_label_font``.  ``units`` is used as x-axis label of panels B
    and C.
    """
    from powerlaw import plot_pdf, Fit, pdf
    from pylab import setp

    label_xy = (-.1, .95)

    # --- Panel A (hidden) ---
    panel_a = fig.add_subplot(n_graphs, n_data, data_inst, visible=False)
    centers, density = pdf(data, linear_bins=True)
    has_mass = density > 0
    density = density[has_mass]
    centers = centers[:-1]
    centers = centers[has_mass]
    panel_a.scatter(centers, density, color='r', s=.5)
    plot_pdf(data[data > 0], ax=panel_a, color='b', linewidth=2)
    setp(panel_a.get_xticklabels(), visible=False)
    # The "A" label, y-label, inset histogram and x-label of this panel are
    # currently disabled.

    # --- Panel B (hidden) ---
    panel_b = fig.add_subplot(
        n_graphs, n_data, n_data + data_inst, sharex=panel_a, visible=False)
    plot_pdf(data, ax=panel_b, color='b', linewidth=2, label="pdf of data")

    fit = Fit(data, xmin=1, discrete=True)
    fit.power_law.plot_pdf(
        ax=panel_b, linestyle=':', color='g', label="power law fit")
    # The returned values are not used anywhere below.
    fit.power_law.pdf()

    panel_b.set_xlim(panel_a.get_xlim())

    fit = Fit(data, discrete=True)
    fit.power_law.plot_pdf(
        ax=panel_b, linestyle='--', color='g', label="power law fit--opt xmin")
    setp(panel_b.get_xticklabels(), visible=True)

    panel_b.annotate(
        "B", label_xy, xycoords="axes fraction",
        fontproperties=panel_label_font)
    panel_b.set_ylabel(u"p(X)")
    b_handles, b_labels = panel_b.get_legend_handles_labels()
    panel_b.legend(b_handles, b_labels, loc=3)
    panel_b.set_xlabel(units)

    # --- Panel C ---
    panel_c = fig.add_subplot(n_graphs, n_data, n_data * 2 + data_inst)
    fit.power_law.plot_pdf(
        ax=panel_c, linestyle='--', color='g',
        label="power law fit\n(opt-min)")
    fit.exponential.plot_pdf(
        ax=panel_c, linestyle='--', color='r',
        label="exponential fit\n(opt-min)")
    fit.plot_pdf(ax=panel_c, color='b', linewidth=2, label="PDF\n(opt-min)")

    # Reuse the limits of the panels above.
    panel_c.set_ylim(panel_b.get_ylim())
    panel_c.set_xlim(panel_a.get_xlim())
    c_handles, c_labels = panel_c.get_legend_handles_labels()
    panel_c.legend(c_handles, c_labels, loc=3, fontsize=12)
    panel_c.set_xlabel(units, fontsize=15)

    panel_c.annotate(
        "C", label_xy, xycoords="axes fraction",
        fontproperties=panel_label_font)
    panel_c.set_ylabel(u"p(X)", fontsize=15)
Пример #13
0
def plot_basics(data, data_inst, fig, units):
    """
    Draw the A/B/C power-law panels for ``data`` in one column of ``fig``.

    The figure is treated as a grid of 4 rows and 2 columns; the panels are
    placed at subplot indices ``data_inst``, ``data_inst + 2`` and
    ``data_inst + 4``:

    A. linearly binned PDF (scatter) plus the ``plot_pdf`` curve of the
       strictly positive values;
    B. data PDF with a discrete power-law fit at ``xmin=1``;
    C. that fit's PDF with its power-law, exponential and lognormal curves.

    Panel letters and the y-label are drawn only when ``data_inst`` is 1.
    ``units`` is the x-axis label of panel C.  Also sets the global tick
    padding rcParams.
    """
    ### Setup ###
    import pylab
    from matplotlib.font_manager import FontProperties
    from powerlaw import plot_pdf, Fit, pdf
    from pylab import setp

    pylab.rcParams['xtick.major.pad'] = '8'
    pylab.rcParams['ytick.major.pad'] = '8'

    label_font = FontProperties().copy()
    label_font.set_weight("bold")
    label_font.set_size(30.0)
    label_font.set_family("sans-serif")
    n_cols = 2
    n_rows = 4
    label_xy = (-.4, .95)
    label_panels = data_inst == 1

    def tag_panel(axes, letter):
        # Put the bold panel letter at the shared label position.
        axes.annotate(letter, label_xy, xycoords="axes fraction",
                      fontproperties=label_font)
    #############

    # --- Panel A ---
    panel_a = fig.add_subplot(n_rows, n_cols, data_inst)
    x, density = pdf(data, linear_bins=True)
    has_mass = density > 0
    density = density[has_mass]
    x = x[:-1]
    x = x[has_mass]
    panel_a.scatter(x, density, color='r', s=.5)
    plot_pdf(data[data > 0], ax=panel_a, color='b', linewidth=2)
    setp(panel_a.get_xticklabels(), visible=False)

    if label_panels:
        tag_panel(panel_a, "A")

    # --- Panel B ---
    panel_b = fig.add_subplot(n_rows, n_cols, n_cols + data_inst,
                              sharex=panel_a)
    plot_pdf(data, ax=panel_b, color='b', linewidth=2)
    fit = Fit(data, xmin=1, discrete=True)
    fit.power_law.plot_pdf(ax=panel_b, linestyle='--', color='g')
    # The returned values are not used anywhere below.
    fit.power_law.pdf()
    # x still holds the non-empty linear bins of panel A.
    panel_b.set_xlim((1, max(x)))

    setp(panel_b.get_xticklabels(), visible=False)

    if label_panels:
        tag_panel(panel_b, "B")
        panel_b.set_ylabel(u"p(X)")

    # --- Panel C ---
    panel_c = fig.add_subplot(n_rows, n_cols, n_cols * 2 + data_inst)
    fit.power_law.plot_pdf(ax=panel_c, linestyle='--', color='g')
    fit.exponential.plot_pdf(ax=panel_c, linestyle='--', color='r')
    fit.lognormal.plot_pdf(ax=panel_c, linestyle=':', color='r')
    fit.plot_pdf(ax=panel_c, color='b', linewidth=2)

    # Reuse the limits of the panels above.
    panel_c.set_ylim(panel_b.get_ylim())
    panel_c.set_xlim(panel_a.get_xlim())

    if label_panels:
        tag_panel(panel_c, "C")

    panel_c.set_xlabel(units)
def main():
    """
    Compute and plot basic statistics of the video network dataset.

    Steps:

    1. load per-video daily views through ``DataLoader``;
    2. load the ``T`` daily network snapshots (pickles named
       ``network_<YYYY-MM-DD>.p`` starting at 2018-09-01) and count, per
       video and day, the in-links whose position is below ``NUM_REL``;
    3. panel (a): indegree CCDF on four target days plus a power-law fit of
       the average indegree;
    4. panel (b): daily views against their percentile, with median, 90th
       and 99th percentile guides;
    5. panel (c): stacked bars of uploads per year (2009-2017) and genre;
    6. print LaTeX-style table rows for the top-20 videos by indegree and
       by views.

    The figure is saved to ``../images/measure_basic_statistics.pdf`` and
    additionally shown when not running on Linux.  Relies on the
    module-level names ``T`` and ``NUM_REL``.
    """
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/'

    target_day_indices = [0, 15, 30, 45]
    color_cycle_4 = ColorPalette.CC4
    date_labels = [
        'Sep 01, 2018', 'Sep 16, 2018', 'Oct 01, 2018', 'Oct 16, 2018'
    ]

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_view_dict = data_loader.embed_view_dict
    embed_avg_view_dict = data_loader.embed_avg_view_dict
    num_videos = data_loader.num_videos

    # One list of views per target day, in embed order.
    target_day_view_list = [[], [], [], []]
    for embed in range(num_videos):
        for target_idx, target_day in enumerate(target_day_indices):
            target_day_view_list[target_idx].append(
                embed_view_dict[embed][target_day])

    # == == == == == == Part 3: Load dynamic network snapshot == == == == == == #
    embed_indegree_dict = {
        embed: np.zeros((T, ))
        for embed in np.arange(num_videos)
    }  # daily indegree for each embed
    zero_indegree_list = []  # percentage of zero indegree for each day
    num_edges_list = []  # number of total edges for each day
    for t in range(T):
        filename = 'network_{0}.p'.format(
            (datetime(2018, 9, 1) + timedelta(days=t)).strftime('%Y-%m-%d'))
        # NOTE: pickle.load can execute arbitrary code; only load snapshot
        # files that come from a trusted source.
        with open(os.path.join(data_prefix, 'network_pickle', filename),
                  'rb') as fin:
            network_dict = pickle.load(fin)
        # embed_tar: [(embed_src, pos_src, view_src), ...]
        indegree_list = []
        for tar_embed in range(num_videos):
            # Only links whose source position is below NUM_REL are counted.
            indegree_value = sum(
                1 for x in network_dict[tar_embed] if x[1] < NUM_REL)
            embed_indegree_dict[tar_embed][t] = indegree_value
            indegree_list.append(indegree_value)
        indegree_counter = Counter(indegree_list)
        zero_indegree_list.append(indegree_counter[0] / num_videos)
        num_edges_list.append(sum(indegree_list))
        print('>>> Finish loading day {0}...'.format(t + 1))
    print('>>> Network structure has been loaded!')
    print('\n>>> Average number of edges: {0:.0f}, max: {1:.0f}, min: {2:.0f}'.
          format(
              sum(num_edges_list) / len(num_edges_list), max(num_edges_list),
              min(num_edges_list)))

    fig, axes = plt.subplots(1, 3, figsize=(12, 4.5))
    ax1, ax2, ax3 = axes.ravel()

    # == == == == == == Part 4: Plot ax1 indegree CCDF == == == == == == #
    embed_avg_indegree_dict = defaultdict(float)
    for t in range(T):
        for embed in range(num_videos):
            embed_avg_indegree_dict[embed] += embed_indegree_dict[embed][t] / T

    indegree_ranked_embed_list = [
        x[0] for x in sorted(embed_avg_indegree_dict.items(),
                             key=lambda kv: kv[1],
                             reverse=True)
    ]
    top_20_indegree_embeds = indegree_ranked_embed_list[:20]
    popular_ranked_embed_list = [
        x[0] for x in sorted(
            embed_avg_view_dict.items(), key=lambda kv: kv[1], reverse=True)
    ]
    top_20_popular_embeds = popular_ranked_embed_list[:20]

    for target_idx, target_day in enumerate(target_day_indices):
        indegree_list = [
            embed_indegree_dict[embed][target_day]
            for embed in range(num_videos)
        ]

        print(
            'video with 10 indegree has more in-links than {0:.2f}% videos on date {1}'
            .format(percentileofscore(indegree_list, 10),
                    date_labels[target_idx]))
        print(
            'video with 20 indegree has more in-links than {0:.2f}% videos on date {1}'
            .format(percentileofscore(indegree_list, 20),
                    date_labels[target_idx]))

        plot_ccdf(indegree_list,
                  ax=ax1,
                  color=color_cycle_4[target_idx],
                  label=date_labels[target_idx])

    # compute the powerlaw fit
    powerlaw_fit = Fit(list(embed_avg_indegree_dict.values()))
    infer_alpha = powerlaw_fit.power_law.alpha
    p = powerlaw_fit.power_law.ccdf()
    # Draw the first 90% of the fitted CCDF, shifted down by a factor of 10
    # so the guide line does not overlap the empirical curves.
    n_guide_points = int(0.9 * len(p))
    ins_x_axis = powerlaw_fit.power_law.parent_Fit.data[:n_guide_points]
    ins_y_axis = 0.1 * p[:n_guide_points]

    ax1.plot(ins_x_axis, ins_y_axis, 'k:')
    ax1.text(0.4,
             0.6,
             r'$x^{{{0:.2f}}}$'.format(-infer_alpha + 1),
             size=12,
             ha='right',
             va='bottom',
             transform=ax1.transAxes)
    ax1.set_xscale('log')
    ax1.set_yscale('log')
    ax1.set_xlabel('indegree', fontsize=11)
    # Raw string: '\g' is an invalid escape sequence in a normal literal.
    ax1.set_ylabel(r'$P(X) \geq x$', fontsize=11)
    ax1.tick_params(axis='both', which='major', labelsize=10)
    ax1.set_title('(a) indegree distribution', fontsize=12)

    ax1.legend(frameon=False, fontsize=11, ncol=1, fancybox=False, shadow=True)

    mean_zero_indegree = sum(zero_indegree_list) / len(zero_indegree_list)

    ax1.axhline(y=1 - mean_zero_indegree, color='k', linestyle='--', zorder=30)
    ax1.text(0.96,
             0.9,
             '{0:.0f}% with 0 indegree'.format(mean_zero_indegree * 100),
             size=11,
             transform=ax1.transAxes,
             ha='right',
             va='top')

    # == == == == == == Part 5: Plot ax2 views distribution == == == == == == #
    for target_idx, views_list in enumerate(target_day_view_list):
        x_values = range(100)
        y_values = [np.percentile(views_list, x) for x in x_values]
        ax2.plot(x_values,
                 y_values,
                 color=color_cycle_4[target_idx],
                 label=date_labels[target_idx])
    ax2.set_yscale('log')
    ax2.set_xlabel('views percentile', fontsize=11)
    ax2.set_ylabel('num of views', fontsize=11)
    ax2.tick_params(axis='both', which='major', labelsize=10)
    ax2.set_title('(b) daily views vs. its percentile', fontsize=12)

    avg_views_list = sorted(list(embed_avg_view_dict.values()), reverse=True)
    gini_coef = gini(avg_views_list)
    print('top 1% videos occupy {0:.2f}% views'.format(
        sum(avg_views_list[:int(0.01 * num_videos)]) / sum(avg_views_list) *
        100))
    print('top 10% videos occupy {0:.2f}% views'.format(
        sum(avg_views_list[:int(0.1 * num_videos)]) / sum(avg_views_list) *
        100))
    print('Gini coef: {0:.3f}'.format(gini_coef))

    spearman_degree = [
        embed_avg_indegree_dict[embed] for embed in range(num_videos)
    ]
    spearman_views = [
        embed_avg_view_dict[embed] for embed in range(num_videos)
    ]

    print(
        'Spearman correlation between views and indegree: {0:.4f}, pvalue: {1:.2f}'
        .format(*spearmanr(spearman_views, spearman_degree)))

    median_views = np.median(avg_views_list)
    top_views_90th = np.percentile(avg_views_list, 90)
    top_views_99th = np.percentile(avg_views_list, 99)
    ax2_xmin = ax2.get_xlim()[0]
    ax2_ymin = ax2.get_ylim()[0]

    # Dashed guides from both axes to the median / 90th / 99th percentile.
    ax2.plot((50, 50), (ax2_ymin, median_views),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.plot((ax2_xmin, 50), (median_views, median_views),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.text(0.49,
             0.45,
             'median views {0:,.0f}'.format(median_views),
             size=11,
             transform=ax2.transAxes,
             ha='right',
             va='bottom')

    ax2.plot((90, 90), (ax2_ymin, top_views_90th),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.plot((ax2_xmin, 90), (top_views_90th, top_views_90th),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.text(0.88,
             0.75,
             '90th views {0:,.0f}'.format(top_views_90th),
             size=11,
             transform=ax2.transAxes,
             ha='right',
             va='bottom')

    ax2.plot((99, 99), (ax2_ymin, top_views_99th),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.plot((ax2_xmin, 99), (top_views_99th, top_views_99th),
             color='k',
             linestyle='--',
             zorder=30)
    ax2.text(0.91,
             0.95,
             '99th views {0:,.0f}'.format(top_views_99th),
             size=11,
             transform=ax2.transAxes,
             ha='right',
             va='bottom')

    # == == == == == == Part 6: Plot ax3 video uploading trend == == == == == == #
    x_axis = range(2009, 2018)
    x_labels = ["'09", "'10", "'11", "'12", "'13", "'14", "'15", "'16", "'17"]
    upload_mat = np.zeros((len(x_axis), 8))

    target_topics = [
        'Pop_music', 'Rock_music', 'Hip_hop_music', 'Independent_music',
        'Country_music', 'Electronic_music', 'Soul_music', 'Others'
    ]
    topic_labels = [
        'Pop', 'Rock', 'Hip hop', 'Independent', 'Country', 'Electronic',
        'Soul', 'Others'
    ]

    color_cycle_8 = ColorPalette.CC8

    data_loader.load_embed_content_dict()
    embed_title_dict = data_loader.embed_title_dict
    embed_uploadtime_dict = data_loader.embed_uploadtime_dict
    embed_genre_dict = data_loader.embed_genre_dict

    for embed in range(num_videos):
        upload_year = int(embed_uploadtime_dict[embed][:4])
        if 2009 <= upload_year <= 2017:
            year_idx = upload_year - 2009

            genres = embed_genre_dict[embed]
            if len(genres) == 0:
                # add one to "Others" genre
                upload_mat[year_idx, 7] += 1
            else:
                # A video with several genres contributes fractionally to
                # each of them, so every video counts exactly once.
                for genre in genres:
                    upload_mat[year_idx,
                               target_topics.index(genre)] += 1 / len(genres)

    print()
    print([
        '{0}: {1}'.format(topic, int(num))
        for topic, num in zip(target_topics, np.sum(upload_mat, axis=0))
    ])

    stackedBarPlot(ax=ax3,
                   data=upload_mat,
                   cols=color_cycle_8,
                   edgeCols=['#000000'] * 8,
                   xlabel='uploaded year',
                   ylabel='num of videos',
                   scale=False,
                   endGaps=True)

    ax3.tick_params(axis='both', which='major', labelsize=9)
    ax3.set_xticks(np.arange(len(x_axis)))
    ax3.set_xticklabels(x_labels)
    ax3.yaxis.set_major_formatter(FuncFormatter(concise_fmt))
    ax3.legend([
        plt.Rectangle((0, 0), 1, 1, fc=c, ec='k', alpha=0.6)
        for c in color_cycle_8
    ],
               topic_labels,
               fontsize=9,
               frameon=False,
               handletextpad=0.2,
               columnspacing=0.3,
               ncol=4,
               bbox_to_anchor=(1, -0.12),
               bbox_transform=ax3.transAxes,
               fancybox=False,
               shadow=True)
    ax3.set_title('(c) VEVO videos uploading trend', fontsize=12)

    # The lists above hold the top 20 videos, so the reported cutoff is 20
    # (the message previously said 15).
    union_top_set = set(top_20_indegree_embeds).union(top_20_popular_embeds)
    print('\n>>> Size of the union set at cutoff 20:', len(union_top_set))
    print('{0:>24} | {1:>17} | {2:>5} | {3:>8} | {4:>6} | {5:>10} | {6:>5}'.
          format('Video title', 'Artist', 'Age', 'Indegree', '-rank', 'Views',
                 '-rank'))
    # Rows are emitted in LaTeX table syntax ("a & b & ... \\").  The title is
    # expected to look like "<artist> - <song> (...)"; age is counted in days
    # up to 2018-11-02.
    for embed in top_20_indegree_embeds:
        print(
            '{0:>24} & {1:>17} & {2:>5} & {3:>8} & {4:>6} & {5:>10} & {6:>5} \\\\'
            .format(
                embed_title_dict[embed].split(
                    ' - ', 1)[1].split('(')[0].split('ft')[0].strip(),
                embed_title_dict[embed].split(
                    ' - ',
                    1)[0].split('&')[0].split(',')[0].strip(), '{0:,}'.format(
                        (datetime(2018, 11, 2) -
                         str2obj(embed_uploadtime_dict[embed])).days),
                '{0:,}'.format(int(embed_avg_indegree_dict[embed])),
                '{0:,}'.format(top_20_indegree_embeds.index(embed) + 1),
                '{0:,}'.format(int(embed_avg_view_dict[embed])),
                '{0:,}'.format(popular_ranked_embed_list.index(embed) + 1)))

    print('\n{0:>24} | {1:>17} | {2:>5} | {3:>8} | {4:>6} | {5:>10} | {6:>5}'.
          format('Video title', 'Artist', 'Age', 'Indegree', '-rank', 'Views',
                 '-rank'))
    for embed in top_20_popular_embeds:
        print(
            '{0:>24} & {1:>17} & {2:>5} & {3:>8} & {4:>6} & {5:>10} & {6:>5} \\\\'
            .format(
                embed_title_dict[embed].split(
                    ' - ', 1)[1].split('(')[0].split('ft')[0].strip(),
                embed_title_dict[embed].split(
                    ' - ',
                    1)[0].split('&')[0].split(',')[0].strip(), '{0:,}'.format(
                        (datetime(2018, 11, 2) -
                         str2obj(embed_uploadtime_dict[embed])).days),
                '{0:,}'.format(int(embed_avg_indegree_dict[embed])),
                '{0:,}'.format(indegree_ranked_embed_list.index(embed) + 1),
                '{0:,}'.format(int(embed_avg_view_dict[embed])),
                '{0:,}'.format(top_20_popular_embeds.index(embed) + 1)))

    hide_spines(axes)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/measure_basic_statistics.pdf', bbox_inches='tight')
    if platform.system() != 'Linux':
        plt.show()
def main():
    """Plot entity frequency CCDFs for complete, sampled and uniformly sampled data.

    For each entity type ('user', 'hashtag') the sample file and the complete
    file under ``../data/cyberbullying_out`` are read and the occurrences of
    every entity are counted.  A uniform random sample is simulated by keeping
    each line of the complete file with probability ``rho``.  A power law is
    fitted to each of the three frequency lists, their CCDFs are drawn on one
    axis per entity, and pairwise two-sample KS tests are printed.

    The figure is written to ``../images/entity_freq_dist.pdf`` and is shown
    interactively on every platform except Linux.

    NOTE: the uniform sample uses the global NumPy random state, which is not
    seeded here, so the printed numbers vary from run to run.
    """
    timer = Timer()
    timer.start()

    app_name = 'cyberbullying'
    archive_dir = '../data/{0}_out'.format(app_name)
    entities = ['user', 'hashtag']
    rho = 0.5272

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))

    cc4 = ColorPalette.CC4
    blue = cc4[0]

    for ax_idx, entity in enumerate(entities):
        sample_path = os.path.join(
            archive_dir, '{0}_{1}_all.txt'.format(entity, app_name))
        complete_path = os.path.join(
            archive_dir, 'complete_{0}_{1}.txt'.format(entity, app_name))

        sample_entity_freq_dict = defaultdict(int)
        complete_entity_freq_dict = defaultdict(int)
        uni_random_entity_freq_dict = defaultdict(int)

        # context managers guarantee both files are closed even if parsing fails
        with open(sample_path, 'r', encoding='utf-8') as sample_file, \
                open(complete_path, 'r', encoding='utf-8') as complete_file:
            if entity == 'user':
                # the user is the second comma-separated field of each line
                for line in sample_file:
                    sample_entity_freq_dict[line.rstrip().split(',')[1]] += 1
                for line in complete_file:
                    user = line.rstrip().split(',')[1]
                    complete_entity_freq_dict[user] += 1
                    # keep this line in the uniform sample with probability rho
                    if np.random.random_sample() <= rho:
                        uni_random_entity_freq_dict[user] += 1
            else:
                # every field after the first is an entity; compared lowercased
                for line in sample_file:
                    for item in line.rstrip().split(',')[1:]:
                        sample_entity_freq_dict[item.lower()] += 1
                for line in complete_file:
                    items = [
                        item.lower() for item in line.rstrip().split(',')[1:]
                    ]
                    for item in items:
                        complete_entity_freq_dict[item] += 1
                    # one toss per line: all of its entities are kept or dropped
                    if np.random.random_sample() <= rho:
                        for item in items:
                            uni_random_entity_freq_dict[item] += 1

        # compute the powerlaw fit in the complete set
        complete_freq_list = list(complete_entity_freq_dict.values())
        complete_powerlaw_fit = Fit(complete_freq_list)
        complete_alpha = complete_powerlaw_fit.power_law.alpha
        complete_xmin = complete_powerlaw_fit.power_law.xmin
        print('{0} complete set alpha {1}, xmin {2}'.format(
            entity, complete_alpha, complete_xmin))
        plot_ccdf(complete_freq_list,
                  ax=axes[ax_idx],
                  color='k',
                  ls='-',
                  label='complete')

        # compute the powerlaw fit in the sample set
        # infer the number of missing entities
        sample_freq_list = list(sample_entity_freq_dict.values())
        sample_freq_counter = Counter(sample_freq_list)

        # number of entities observed exactly 1, 2, ..., num_interest times
        # (Counter yields 0 for frequencies that never occur)
        num_interest = 100
        sample_freq_list_top100 = [
            sample_freq_counter[freq] for freq in range(1, num_interest + 1)
        ]

        inferred_num_missing = infer_missing_num(sample_freq_list_top100,
                                                 rho=rho,
                                                 m=num_interest)
        # entities that were never sampled enter the distribution as zeros
        corrected_sample_freq_list = (
            sample_freq_list + [0] * inferred_num_missing)
        sample_powerlaw_fit = Fit(corrected_sample_freq_list)
        sample_alpha = sample_powerlaw_fit.power_law.alpha
        sample_xmin = sample_powerlaw_fit.power_law.xmin
        print('{0} sample set alpha {1}, xmin {2}'.format(
            entity, sample_alpha, sample_xmin))
        plot_ccdf(corrected_sample_freq_list,
                  ax=axes[ax_idx],
                  color=blue,
                  ls='-',
                  label='sample')

        # compute the powerlaw fit in uniform random sample
        uni_random_num_missing = len(complete_entity_freq_dict) - len(
            uni_random_entity_freq_dict)
        uni_random_freq_list = (list(uni_random_entity_freq_dict.values()) +
                                [0] * uni_random_num_missing)
        uni_random_powerlaw_fit = Fit(uni_random_freq_list)
        uni_random_alpha = uni_random_powerlaw_fit.power_law.alpha
        uni_random_xmin = uni_random_powerlaw_fit.power_law.xmin
        print('{0} uniform random sampling alpha {1}, xmin {2}'.format(
            entity, uni_random_alpha, uni_random_xmin))
        plot_ccdf(uni_random_freq_list,
                  ax=axes[ax_idx],
                  color='k',
                  ls='--',
                  label='uniform random')

        print('inferred missing', inferred_num_missing)
        print('empirical missing',
              len(complete_entity_freq_dict) - len(sample_entity_freq_dict))
        print('uniform random missing', uni_random_num_missing)

        print('KS test (sample, uniform)')
        print(stats.ks_2samp(corrected_sample_freq_list, uni_random_freq_list))

        print('KS test (sample, complete)')
        print(stats.ks_2samp(corrected_sample_freq_list, complete_freq_list))

        print('KS test (uniform, complete)')
        print(stats.ks_2samp(uni_random_freq_list, complete_freq_list))

        # symlog keeps the zero-frequency entries visible on the x axis
        axes[ax_idx].set_xscale('symlog')
        axes[ax_idx].set_yscale('log')
        axes[ax_idx].set_xlabel('frequency', fontsize=16)
        axes[ax_idx].tick_params(axis='both', which='major', labelsize=16)

    axes[0].set_xticks([0, 1, 100, 10000])
    axes[0].set_yticks([1, 0.01, 0.0001, 0.000001])
    # raw string: '\g' is not a valid escape sequence in a normal literal
    axes[0].set_ylabel(r'$P(X \geq x)$', fontsize=16)
    axes[0].legend(frameon=False,
                   fontsize=16,
                   ncol=1,
                   fancybox=False,
                   shadow=True,
                   loc='lower left')
    axes[0].set_title('(a) user posting', fontsize=18, pad=-3 * 72, y=1.0001)

    axes[1].set_xticks([0, 1, 100, 10000, 1000000])
    axes[1].set_yticks([1, 0.1, 0.001, 0.00001])
    axes[1].set_title('(b) hashtag', fontsize=18, pad=-3 * 72, y=1.0001)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/entity_freq_dist.pdf', bbox_inches='tight')
    if platform.system() != 'Linux':
        plt.show()
Пример #16
0
def plot_basics(data, data_inst, fig, units):
    """Draw the three stacked PDF panels (A, B, C) for one dataset.

    Panel A shows the linearly and logarithmically binned empirical PDF with
    a small histogram inset, panel B overlays power-law fits with ``xmin=1``
    (dotted) and with an estimated ``xmin`` (dashed), and panel C compares the
    power-law and exponential fits above the estimated ``xmin``.

    Relies on the module-level names ``n_graphs`` and ``n_data`` for the
    subplot grid.

    Args:
        data: values to analyse; boolean-mask indexing (``data[data>0]``) is
            applied, so this is presumably a NumPy array.
        data_inst: 1-based column index of this dataset in the subplot grid;
            panel letters and the y label are only drawn for column 1.
        fig: matplotlib figure that receives the subplots.
        units: x-axis label of the bottom panel.
    """
    from powerlaw import plot_pdf, Fit, pdf
    from pylab import setp
    # the legacy ``mpl_toolkits.axes_grid`` package is gone from current
    # matplotlib; ``axes_grid1`` provides the same ``inset_axes`` helper
    from mpl_toolkits.axes_grid1.inset_locator import inset_axes

    annotate_coord = (-.4, .95)

    # --- Panel A: empirical PDF, linear bins (red) and log bins (blue) ---
    ax1 = fig.add_subplot(n_graphs, n_data, data_inst)
    plot_pdf(data[data > 0], ax=ax1, linear_bins=True, color='r', linewidth=.5)
    x, y = pdf(data, linear_bins=True)
    ind = y > 0
    y = y[ind]
    # drop the last x entry so x matches y in length before masking
    # (presumably x holds the bin edges -- confirm against powerlaw.pdf)
    x = x[:-1]
    x = x[ind]
    ax1.scatter(x, y, color='r', s=.5)
    plot_pdf(data[data > 0], ax=ax1, color='b', linewidth=2)
    setp(ax1.get_xticklabels(), visible=False)
    ax1.set_yticks(ax1.get_yticks()[::2])
    if data_inst == 1:
        ax1.annotate("A", annotate_coord, xycoords="axes fraction", fontsize=14)

    ax1in = inset_axes(ax1, width="30%", height="30%", loc=3)
    # ``normed`` was removed from Axes.hist; ``density`` is its replacement
    ax1in.hist(data, density=True, color='b')
    ax1in.set_xticks([])
    ax1in.set_yticks([])

    # --- Panel B: power-law fits with fixed and with estimated xmin ---
    ax2 = fig.add_subplot(n_graphs, n_data, n_data + data_inst, sharex=ax1)
    plot_pdf(data, ax=ax2, color='b', linewidth=2)
    fit = Fit(data, xmin=1, discrete=True)
    fit.power_law.plot_pdf(ax=ax2, linestyle=':', color='g')
    ax2.set_xlim(ax1.get_xlim())

    # refit without a fixed xmin; this fit is reused for panel C
    fit = Fit(data, discrete=True)
    fit.power_law.plot_pdf(ax=ax2, linestyle='--', color='g')
    setp(ax2.get_xticklabels(), visible=False)
    # cap the upper y limit at 1
    if ax2.get_ylim()[1] > 1:
        ax2.set_ylim(ax2.get_ylim()[0], 1)

    ax2.set_yticks(ax2.get_yticks()[::2])
    if data_inst == 1:
        ax2.annotate("B", annotate_coord, xycoords="axes fraction", fontsize=14)
        ax2.set_ylabel(r"$p(X)$")

    # --- Panel C: power law vs. exponential for the estimated-xmin fit ---
    ax3 = fig.add_subplot(n_graphs, n_data, n_data * 2 + data_inst)
    fit.power_law.plot_pdf(ax=ax3, linestyle='--', color='g')
    fit.exponential.plot_pdf(ax=ax3, linestyle='--', color='r')
    fit.plot_pdf(ax=ax3, color='b', linewidth=2)

    # reuse the limits of the panels above so the three plots line up
    ax3.set_ylim(ax2.get_ylim())
    ax3.set_yticks(ax3.get_yticks()[::2])
    ax3.set_xlim(ax1.get_xlim())

    if data_inst == 1:
        ax3.annotate("C", annotate_coord, xycoords="axes fraction", fontsize=14)

    ax3.set_xlabel(units)
Пример #17
0
def out_degree(adj):
    """Return the number of non-zero entries in each row of ``adj``.

    Presumably ``adj`` is an adjacency matrix whose rows are source nodes, so
    each count is that node's out-degree -- confirm against the caller.
    """
    nonzero_per_row = np.count_nonzero(adj, axis=1)
    return nonzero_per_row


# Node-level measures; each one fills one column of the axes grid.
funcs = [in_sum, out_sum, in_degree, out_degree]

# One row of panels per graph type: empirical CCDF plus fitted distributions.
for i, g_type in enumerate(GRAPH_TYPES):
    g_type_label = GRAPH_TYPE_LABELS[i]
    adj = load_everything(g_type, version=BRAIN_VERSION)
    for j, measure in enumerate(funcs):
        panel = axs[i, j]
        vals = measure(adj)
        # only strictly positive values are plotted and fitted
        vals = vals[vals > 0]
        plot_ccdf(data=vals, ax=panel)
        results = Fit(vals)
        line = results.power_law.plot_ccdf(
            ax=panel,
            c="r",
            shift_by="original_data",
            linestyle="--",
            label="Power law",
        )
        results.lognormal.plot_ccdf(
            ax=panel,
            c="g",
            shift_by="original_data",
            linestyle="--",
            label="Lognormal",
        )
Пример #18
0
class _Analyzer(ABC):
    """Abstract base for analyzers that fit power-law tails to return series.

    For every iteration id the analyzer sets the current input array (done by
    the subclass in `_set_curr_input_array`), fits it with `Fit`, computes
    tail / log-likelihood / returns statistics and writes them into the
    results DataFrame held by `self.res`.

    Subclasses must implement `_set_curr_input_array` and, per the comment in
    `_analyze_next`, provide `self.iter_id_keys`.
    """

    def __init__(self, settings):
        """Store the settings sub-objects and create the Returns/Results helpers.

        Args:
            settings: object exposing `ctrl`, `data` and `anal` attributes;
                it is also passed on to `Returns` and `Results`.
        """
        self.sc = settings.ctrl
        self.sd = settings.data
        self.sa = settings.anal

        # TODO: factor setting of these boolean flags into own method
        # NOTE: _use_pct_file is only defined when txmin_map is truthy
        if self.sa.txmin_map:
            self._use_pct_file = any('PCT' in col_hdr for col_hdr
                                     in self.sa.txmin_map.values())

        self.rtn = Returns(settings)
        self.res = Results(settings)

        # short key used in result column names -> distribution name handed
        # to Fit.distribution_compare
        self._distros_to_compare = {'tpl': 'truncated_power_law',
                                    'exp': 'exponential',
                                    'lgn': 'lognormal'}

    # # # iteration state DEPENDENT (or aware) methods # # #

    def _log_curr_iter(self):
        """Print which group, tail and date range the current iteration covers."""
        # TODO: factor out repetitive log? (static: date, dynamic: group_label)
        # curr_iter_id is (group, tail) or (group, date, tail); `date` is a
        # list that is empty in the former case
        gtyp, *date, tail = self.curr_iter_id
        grp_tail_log = (f"Analyzing {tail.name.upper()} tail of time series "
                        f"for {self.sd.grouping_type.title()} '{gtyp}' ")
        if bool(date):  # dynamic approach
            df = date[0]
            di = self.sa.get_dyn_lbd(df)
            # NOTE: di above is 1st date w/ price, not 1st date w/ return
        else:           # static approach
            di, df = self.sd.date_i, self.sd.date_f
        date_log = f"b/w [{di}, {df}]"
        print(grp_tail_log + date_log)

    @abstractmethod
    def _set_curr_input_array(self):
        """Set the input data for the current iteration (subclass hook)."""
        # NOTE: storage posn into results_df (curr_df_pos) also set here
        pass

    def __get_xmin(self):
        """Return the xmin to pass to `Fit`, according to `self.sa.xmin_rule`.

        Rules handled:
            "clauset" / "manual": `xmin_qnty` is returned unchanged.
            "percent": the `xmin_qnty`-th percentile of the signed returns.
            "std-dev": delegated to `__calc_stdv_xmin`.
            "file" / "average": looked up in `xmin_qnty` by date and column;
                the value is used as a percentile when it is a string ending
                in '%' or when `_use_pct_file` is set, otherwise as a number.

        Raises:
            AssertionError: "file"/"average" used without `use_dynamic`.
            TypeError: a fractional percentile from file lies outside [0, 1].
            AttributeError: unknown rule.
        """
        rule, qnty = self.sa.xmin_rule, self.sa.xmin_qnty
        if rule in {"clauset", "manual"}:
            xmin = qnty  # ie. {None, user-input-ℝ} respectively
        elif rule == "percent":
            xmin = np.percentile(self.curr_signed_returns, qnty)
        elif rule == "std-dev":
            xmin = self.__calc_stdv_xmin(qnty)
        elif rule in {"file", "average"}:
            assert self.sa.use_dynamic,\
                ("static approach does NOT currently support passing "
                 "xmin data by file")  # TODO: add file support for -a static?
            grp, date, tail = self.curr_iter_id
            txmin = self.sa.txmin_map[tail]
            xmin = qnty.loc[date, f"{txmin} {grp}"]
            if isinstance(xmin, str) and xmin.endswith("%"):
                # b/c values containing '%' in xmins_df must be str
                percent = float(xmin[:-1])
            elif isinstance(xmin, (int, float)) and self._use_pct_file:
                # NOTE(review): the check is on [0, 1] but the message below
                # says 0-100 -- confirm which wording is intended
                if not (0 <= xmin <= 1):
                    raise TypeError("xmin percentile threshold value for "
                                    f"{self.iter_id_keys} is outside of 0-100")
                percent = xmin * 100
            else:
                pass  # numerical xmin data reaches this branch
            # `percent` is only bound in the two branches above; otherwise
            # reading it raises UnboundLocalError (a NameError subclass) and
            # the value is used directly as a numeric xmin
            try:
                xmin = np.percentile(self.curr_signed_returns, percent)
            except NameError:
                xmin = float(xmin)
        else:
            raise AttributeError("this should never be reached!")
        return xmin

    def __calc_stdv_xmin(self, factor):
        """Return |mean + tail.value * factor * stdev| of the current returns.

        Raises:
            AssertionError: if the mean is not below `factor` * stdev.
        """
        mean = st.fmean(self.curr_returns_array)
        stdv = st.stdev(self.curr_returns_array)
        *_, tail = self.curr_iter_id
        assert mean < factor * stdv
        return abs(mean + tail.value * factor * stdv)  # tail.value ∈ {1, -1}

    def _fit_curr_data(self):
        """Fit the non-zero signed returns and store the result in `curr_fit`."""
        data = self.curr_signed_returns
        data = data[np.nonzero(data)]  # only use non-zero elements to do Fit
        xmin = self.__get_xmin()
        self.curr_fit = Fit(data=data, xmin=xmin,
                            discrete=self.sa.fit_discretely)

    @staticmethod
    def gen_rmsf(mmt_func):     # rmsf: Returns Moments Statistics Functions
        """Return three variants of a moment function.

        Returns:
            A 3-tuple: `mmt_func` itself, a callable applying it to the
            strictly positive elements of its argument, and a callable
            applying it to the strictly negative elements.  The latter two
            return NaN when `mmt_func` raises `statistics.StatisticsError`.
        """
        def mf_wrapped(mmt_func, rtrn_vec):
            try:
                return mmt_func(rtrn_vec)
            except st.StatisticsError:
                return np.nan
        return (mmt_func,
                lambda rv: mf_wrapped(mmt_func, rv[rv>0]),
                lambda rv: mf_wrapped(mmt_func, rv[rv<0]))

    def __get_curr_rtrn_stats(self):
        """Compute every returns statistic for the current returns array.

        Returns:
            dict mapping each label in `self.sd.rstats_collabs` to the value
            of the function at the same position in `rs_fns`.
        """
        # NOTE: functions in below list must match order in output_columns.yaml
        # order: length, number of zeros, number of non-zeros, then
        # (all, positive-only, negative-only) for mean, stdev, skew, kurtosis
        rs_fns = (len, lambda r: np.count_nonzero(r == 0), np.count_nonzero,
                  *_Analyzer.gen_rmsf(st.fmean),
                  *_Analyzer.gen_rmsf(st.stdev),
                  *_Analyzer.gen_rmsf(scipy.stats.skew),
                  *_Analyzer.gen_rmsf(scipy.stats.kurtosis),)
        rstats_fmap = {self.sd.rstats_collabs[i]: rs_fns[i] for i
                       in range(len(rs_fns))}
        return {rstat: rstats_fmap[rstat](self.curr_returns_array)
                for rstat in self.sd.rstats_collabs}

    def __get_curr_tail_stats(self):
        """Collect power-law tail statistics of the current fit.

        The result is assembled from this method's local variables via
        `locals()`: a statistic is included only if a local with exactly the
        same name as its label in `self.sd.tstats_collabs` exists.  Renaming
        a local here therefore changes the output.

        Returns:
            dict keyed by ('tail-statistics', <stat label>).
        """
        alpha, xmin, sigma = (getattr(self.curr_fit.power_law, prop)
                              for prop in ('alpha', 'xmin', 'sigma'))
        # statistics of the observations at or above xmin
        elm_in_fit = self.curr_signed_returns >= xmin
        fitted_vec = self.curr_signed_returns[elm_in_fit]
        xmax = max(fitted_vec)
        xmean = fitted_vec.mean()
        xstdv = fitted_vec.std()
        abs_len = len(fitted_vec)
        # ks_pv only exists (and is only reported) when the KS test is enabled
        if self.sa.run_ks_test is True:
            # TODO: try compute ks_pv using MATLAB engine & module, and time
            ks_pv, _ = plpva(self.curr_signed_returns, xmin, 'reps',
                             self.sa.ks_iter, 'silent')
        locs = locals()
        return {('tail-statistics', stat): locs.get(stat) for st_type, stat
                in self.sd.tstats_collabs if stat in locs}

    def __get_curr_logl_stats(self):
        """Compare the power law against each distribution in `_distros_to_compare`.

        Returns:
            dict keyed by ('log-likelihoods', '<distro key>_R') and
            ('log-likelihoods', '<distro key>_p') holding the two values
            returned by `Fit.distribution_compare`.
        """
        # compute (R, p)-pairs (x3) using powerlaw.Fit.distribution_compare
        logl_stats = {key:
                      {stat: val for stat, val in
                       zip(('R', 'p'),
                           self.curr_fit.distribution_compare(
                               'power_law', distro,
                               normalized_ratio=True))}
                      for key, distro in self._distros_to_compare.items()}
        return {('log-likelihoods', f"{dist}_{st}"): val for dist, stats
                in logl_stats.items() for st, val in stats.items()}

    def __get_curr_plfit_stats(self):
        """Return tail statistics merged with log-likelihood statistics.

        The log-likelihood part is only computed when
        `self.sa.compare_distros` is truthy.
        """
        tail_stats = self.__get_curr_tail_stats()
        logl_stats = (self.__get_curr_logl_stats()
                      if self.sa.compare_distros else {})
        return {**tail_stats, **logl_stats}

    def __get_calcd_substats_map(self, sstype):
        """Compute one family of statistics, keyed by results-DataFrame column.

        Args:
            sstype: 'plfit' for power-law fit statistics or 'returns' for
                returns statistics.

        Returns:
            dict mapping full column tuples (top-level group prefix + stat
            key) to values, or an empty dict when this family is not needed.
        """
        idx, col = self.curr_df_pos  # type(idx)==str; type(col)==tuple

        if sstype == 'plfit':
            stcalc_fn = self.__get_curr_plfit_stats
            top_grp = col if self.sa.use_dynamic else (col,)
            need_ss = self.sa.analyze_tails
        elif sstype == 'returns':
            stcalc_fn = self.__get_curr_rtrn_stats
            top_grp = ((col,) if not self.sa.analyze_tails else
                       (col[0],) if self.sa.use_dynamic else
                       ())
            # NOTE: hasnans check below on (<col>, 'rtrn-stats') Rm's redundant
            # calc only works for 1-proc b/c multiproc only updts res_df at end
            rstat_uncalcd = self.res.df.loc[idx, top_grp + ('returns-statistics',)].hasnans
            need_ss = self.sa.calc_rtrn_stats and rstat_uncalcd

        # stcalc_fn is only called when the statistics are actually needed
        return ({top_grp + tuple(ss_key): ss_val
                for ss_key, ss_val in stcalc_fn().items()}
                if need_ss else {})

    def _gset_curr_partial_results(self, action):
        """Compute the current iteration's statistics and store or return them.

        Args:
            action: 'store' updates row `idx` of `self.res.df`;
                'return' returns `(idx, series)` instead.  Any other value
                does nothing and returns None.
        """
        fstats_map = self.__get_calcd_substats_map('plfit')
        rstats_map = self.__get_calcd_substats_map('returns')

        # TODO: use np.ndarray instead of pd.Series (wasteful) --> order later
        curr_part_res_series = pd.Series({**fstats_map, **rstats_map})

        idx, _ = self.curr_df_pos
        if action == 'store':
            # NOTE(review): `.loc[idx]` may yield a copy (e.g. under pandas
            # copy-on-write), in which case this update would not reach
            # self.res.df -- confirm
            self.res.df.loc[idx].update(curr_part_res_series)
            # TODO: consider using pd.DataFrame.replace(, inplace=True) instead
            # TODO: can also order stats results first, then assign to DF row
        elif action == 'return':
            return idx, curr_part_res_series

    # # # orchestration / driver methods # # #

    # convenience wrapper to keep things tidy
    def _run_curr_iter_fitting(self):
        """Log, load the input array and fit for the current iteration id."""
        self._log_curr_iter()
        self._set_curr_input_array()
        self._fit_curr_data()

    # runs analysis on data ID'd by the next iteration of the stateful iterator
    def _analyze_next(self):  # TODO: combine _analyze_next & _analyze_iter??
        """Advance `iter_id_keys`, fit that iteration and store its results.

        Raises:
            StopIteration: when `iter_id_keys` is exhausted.
        """
        self.curr_iter_id = next(self.iter_id_keys)  # set in subclasses
        self._run_curr_iter_fitting()
        self._gset_curr_partial_results('store')

    # runs analysis from start to finish, in 1-process + single-threaded mode
    def analyze_sequential(self):
        """Run `_analyze_next` until the iteration ids are exhausted."""
        while True:
            try:
                self._analyze_next()
            except StopIteration:
                break

    # runs analysis for one iteration of analysis given arbitrary iter_id
    def _analyze_iter(self, iter_id):  # NOTE: use this to resume computation
        """Analyze a single given iteration id and return `(idx, series)`."""
        print(f"### DEBUG: PID {getpid()} analyzing iter {iter_id}", file=sys.stderr)
        self.curr_iter_id = iter_id
        self._run_curr_iter_fitting()
        return self._gset_curr_partial_results('return')

    # runs analysis in multiprocessing mode
    def analyze_multiproc(self):
        """Analyze all iteration ids with a process pool, then store the results.

        The results DataFrame is only updated after all workers finished.
        """
        # TODO: https://stackoverflow.com/a/52596590/5437918 (use shared DBDFs)
        # materialize the iterator so it can be handed to pool.map
        iter_id_keys = tuple(self.iter_id_keys)

        # TODO: look into Queue & Pipe for sharing data
        with Pool(processes=self.sc.nproc) as pool:
            # TODO checkout .map alternatives: .imap, .map_async, etc.
            restup_ls = [restup for restup in  # TODO: optimize chunksize below
                         pool.map(self._analyze_iter, iter_id_keys)]

        # TODO: update res_df more efficiently, ex. pd.df.replace(), np.ndarray
        for restup in restup_ls:
            idx, res = restup  # if use '+' NOTE that DFs init'd w/ NaNs
            self.res.df.loc[idx].update(res)

    # top-level convenience method that autodetects how to run tail analysis
    def analyze(self):
        """Run the analysis sequentially or in parallel depending on `nproc`.

        Raises:
            TypeError: if `self.sc.nproc` is zero or negative.
        """
        nproc = self.sc.nproc
        # TODO: add other conditions for analyze_sequential (ex. -a static)
        if nproc == 1:
            self.analyze_sequential()
        elif nproc > 1:
            self.analyze_multiproc()
        else:  # if 0 or negative number of processors got through to here
            raise TypeError(f'Cannot perform analysis with {nproc} processes')

    def get_resdf(self):
        """Prettify the results DataFrame in place and return it."""
        # TODO: final clean ups of DF for presentation:
        #       - use .title() on all index labels, then write to file
        self.res.prettify_df()
        return self.res.df
Пример #19
0
 def _fit_curr_data(self):
     """Fit the current signed returns and keep the result in ``self.curr_fit``.

     Zero-valued returns are excluded before fitting; the lower bound of the
     fit comes from ``__get_xmin`` and the discreteness flag from
     ``self.sa.fit_discretely``.
     """
     signed_returns = self.curr_signed_returns
     nonzero_returns = signed_returns[np.nonzero(signed_returns)]
     threshold = self.__get_xmin()
     self.curr_fit = Fit(
         data=nonzero_returns,
         xmin=threshold,
         discrete=self.sa.fit_discretely,
     )
Пример #20
0
def main():
    """Compute summary statistics of the reference graph and plot histograms.

    Loads the graph from ``../../data/internal-references-pdftotext.json.gz``,
    then computes degree sequences, power-law fits of the in/out degree, the
    clustering coefficients, the size of the largest weakly connected
    component and the fraction of isolated nodes.  Intermediate arrays are
    written to ``../../data``, a LaTeX table to ``graph-stats-<date>.tex`` and
    a one-row figure with three log-log histograms to
    ``figures/histograms_onerow-<date>.pdf``.

    Relies on the module-level names ``Header``, ``WoS``, ``CiteSeer`` and
    ``ArXiv`` for the remaining table rows.
    """
    #Load graph
    name = '../../data/internal-references-pdftotext.json.gz'
    q = ia.loaddata(fname=name)
    G = ia.makegraph(q)

    #basic stats
    N_nodes, N_edges = G.number_of_nodes(), G.number_of_edges()

    #Degree
    t1 = time.time()
    in_deg = [d for n, d in G.in_degree()]
    out_deg = [d for n, d in G.out_degree()]
    np.savetxt('../../data/in_degree.txt', in_deg)
    np.savetxt('../../data/out_degree.txt', out_deg)
    # reported mean degree is twice the mean in-degree
    mean_k = 2 * np.mean(in_deg)
    t2 = time.time()
    print('degree took ' + str((t2 - t1) / 60.0) + ' mins')

    #Find powerlaw fits
    fit_in, fit_out = Fit(in_deg, xmin=0), Fit(out_deg, xmin=0)
    alpha_in = np.round(fit_in.power_law.alpha, 2)
    xmin_in = np.round(fit_in.power_law.xmin, 2)
    alpha_out = np.round(fit_out.power_law.alpha, 2)
    xmin_out = np.round(fit_out.power_law.xmin, 2)
    print('For power law fitting in-degree: x_min = ' + str(xmin_in))
    print('For power law fitting out-degree: x_min = ' + str(xmin_out) + '\n')

    #Clustering coeff
    t1 = time.time()
    cs = list(nx.clustering(G).values())
    np.savetxt('../../data/clustering_c.txt', cs)
    mean_C = np.round(np.mean(cs), 2)
    t2 = time.time()
    print('cluster coeff took ' + str((t2 - t1) / 60.0) + ' mins')

    #Size-biggest
    t1 = time.time()
    # materialize the components once; they feed both statistics below
    comps = list(nx.weakly_connected_components(G))
    biggest = max(comps, key=len)
    G_cc = G.subgraph(biggest)
    size_WCC = 1.0 * G_cc.number_of_nodes()
    fraction_WCC = np.round(size_WCC / N_nodes, 2)

    #Num isolated: components consisting of a single node
    num_isolated = sum(1 for cc in comps if len(cc) == 1)
    fraction_isolated = np.round((1.0 * num_isolated) / N_nodes, 2)

    t2 = time.time()
    print('cluster size dist ' + str((t2 - t1) / 60.0) + ' mins')

    #results
    graph_stats = [
        N_nodes, N_edges, mean_k, alpha_in, alpha_out, mean_C, fraction_WCC,
        fraction_isolated
    ]
    print(graph_stats)

    #Stuff for tables
    OpenArXiv = ['openArXiv']
    OpenArXiv.extend(map('{:.3f}'.format, graph_stats))

    # Automatically make table!
    datenow = str(datetime.now()).split()[0]  # date part only, YYYY-MM-DD
    with open('graph-stats-{}.tex'.format(datenow), 'w') as fout:
        fout.write(make_latex_table([Header, OpenArXiv, WoS, CiteSeer, ArXiv]))

    #### MAKE FIGURE
    tick_size = 20
    axis_size = 30
    label_size = 28
    label_y_position = 1.10
    inset_size = 18
    plt.figure(figsize=(20, 5))

    #Histogram in-degree
    n_bins = 30
    ax1 = plt.subplot(131)
    plt.hist(in_deg, alpha=0.75, bins=n_bins)
    plt.xlabel('$k_{in}$', fontsize=axis_size)
    plt.xticks(fontsize=tick_size)
    plt.yticks(fontsize=tick_size)
    plt.rc('font', size=15)
    ax1.set_xscale('log')
    ax1.set_yscale('log')
    ax1.spines["top"].set_visible(False)
    ax1.spines["right"].set_visible(False)
    # bold panel letter just outside the top-left corner of the axes
    ax1.text(-0.025,
             label_y_position,
             'a',
             transform=ax1.transAxes,
             fontsize=label_size,
             fontweight='bold',
             va='top',
             ha='right')
    # empty placeholder for an inset annotation
    ax1.text(0.9,
             0.55,
             '',
             transform=ax1.transAxes,
             fontsize=inset_size,
             va='top',
             ha='right')

    #Histogram out-degree
    ax2 = plt.subplot(132)
    plt.hist(out_deg, alpha=0.75, bins=n_bins)
    plt.xlabel('$k_{out}$', fontsize=axis_size)
    plt.xticks(fontsize=tick_size)
    plt.yticks(fontsize=tick_size)
    plt.rc('font', size=15)
    ax2.set_xscale('log')
    ax2.set_yscale('log')
    ax2.spines["top"].set_visible(False)
    ax2.spines["right"].set_visible(False)
    ax2.text(-0.025,
             label_y_position,
             'b',
             transform=ax2.transAxes,
             fontsize=label_size,
             fontweight='bold',
             va='top',
             ha='right')
    # placeholder positioned relative to this panel's own axes
    ax2.text(0.9,
             0.55,
             '',
             transform=ax2.transAxes,
             fontsize=inset_size,
             va='top',
             ha='right')

    #Histogram clustering coefficients
    ax3 = plt.subplot(133)
    plt.hist(cs, alpha=0.75, bins=n_bins)
    ax3.set_xscale('log')
    ax3.set_yscale('log')
    plt.xlabel('$C$', fontsize=axis_size)
    plt.xticks(fontsize=tick_size)
    plt.yticks(fontsize=tick_size)
    plt.rc('font', size=15)
    ax3.spines["top"].set_visible(False)
    ax3.spines["right"].set_visible(False)
    ax3.text(-0.025,
             label_y_position,
             'c',
             transform=ax3.transAxes,
             fontsize=label_size,
             fontweight='bold',
             va='top',
             ha='right')
    # placeholder positioned relative to this panel's own axes
    ax3.text(0.9,
             0.55,
             '',
             transform=ax3.transAxes,
             fontsize=inset_size,
             va='top',
             ha='right')
    plt.tight_layout()
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs('figures', exist_ok=True)
    plt.savefig('figures/histograms_onerow-{}.pdf'.format(datenow))
Пример #21
0
 def _get_fit_obj(self, data, xmin=None):
     """Build a ``Fit`` object from the non-zero elements of ``data``.

     Args:
         data: array-like of observations; converted with ``np.asarray``.
         xmin: optional lower bound, passed on to ``__get_xmin`` which
             determines the value finally handed to ``Fit``.

     Returns:
         The ``Fit`` instance for the filtered data; the fit is discrete
         unless ``self.ds.data_nature`` is ``'continuous'``.
     """
     values = np.asarray(data)
     # np.nonzero returns index arrays, not values: index back into the
     # data so that only the non-zero elements themselves are kept
     values = values[np.nonzero(values)]
     discrete = self.ds.data_nature != 'continuous'
     xmin = self.__get_xmin(xmin=xmin, data=values)
     return Fit(values, discrete=discrete, xmin=xmin)