Example #1
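All snippets on this page come from the same plotting module and share one import preamble; a plausible reconstruction is below (the repository-local helpers in the trailing comment are an assumption about where Timer, ColorPalette, etc. live; minimal sketches of several of them appear further down the page).

import json
import os
import platform
from collections import Counter, defaultdict
from datetime import datetime, timezone

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib.ticker import FuncFormatter
from powerlaw import Fit
from scipy import stats
from scipy.stats import entropy, percentileofscore

# repository-local helpers (module path assumed; adjust to the actual layout):
# from utils.helpers import (Timer, ColorPalette, concise_fmt, hide_spines,
#                            melt_snowflake, plot_ccdf, negative_binomial,
#                            infer_missing_num, map_ratemsg,
#                            mean_confidence_interval, write_to_file,
#                            read_from_file)
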
def main():
    timer = Timer()
    timer.start()

    cc4 = ColorPalette.CC4

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))

    start_idx = 21000
    end_idx = 25500
    timestamp_list = []
    track_list = []

    with open('rate_limit_2015-09-08.txt', 'r') as fin:
        for line in fin:
            rate_json = json.loads(line.rstrip())
            track = rate_json['limit']['track']
            track_list.append(track)
            timestamp = datetime.utcfromtimestamp(
                (int(rate_json['limit']['timestamp_ms'][:-3])))
            timestamp_list.append(timestamp)

    axes[0].scatter(timestamp_list[start_idx:end_idx],
                    track_list[start_idx:end_idx],
                    c='k',
                    s=0.4)
    axes[0].set_xlim([timestamp_list[start_idx], timestamp_list[end_idx]])
    axes[0].xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))
    axes[0].set_xticks(axes[0].get_xticks()[::2])
    axes[0].set_xlabel('Sep 08, 2015', fontsize=16)
    axes[0].set_ylabel('value', fontsize=16)
    axes[0].yaxis.set_major_formatter(FuncFormatter(concise_fmt))
    axes[0].tick_params(axis='both', which='major', labelsize=16)
    axes[0].set_title('(a)', size=18, pad=-3 * 72, y=1.0001)

    print('start timestamp', timestamp_list[start_idx])
    print('end timestamp', timestamp_list[end_idx])
    split_track_lst, split_ts_lst = map_ratemsg(
        track_list[start_idx:end_idx], timestamp_list[start_idx:end_idx])
    total_miss = 0
    for track_lst, ts_lst, color in zip(split_track_lst, split_ts_lst, cc4):
        axes[1].scatter(ts_lst, track_lst, c=color, s=0.4)
        total_miss += (track_lst[-1] - track_lst[0])
    print('{0} tweets are missing'.format(total_miss))
    axes[1].set_xlim([timestamp_list[start_idx], timestamp_list[end_idx]])
    axes[1].xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))
    axes[1].set_xticks(axes[1].get_xticks()[::2])
    axes[1].set_xlabel('Sep 08, 2015', fontsize=16)
    axes[1].yaxis.set_major_formatter(FuncFormatter(concise_fmt))
    axes[1].tick_params(axis='both', which='major', labelsize=16)
    axes[1].set_title('(b)', size=18, pad=-3 * 72, y=1.0001)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/SI_ratemsg_coloring.pdf', bbox_inches='tight')
    if platform.system() != 'Linux':
        plt.show()
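
concise_fmt above is a repository helper used as a tick formatter; a minimal sketch of its assumed behavior (abbreviating large tick values) is:

def concise_fmt(x, _):
    # assumed behavior: abbreviate tick values, e.g. 25000 -> '25K', 2000000 -> '2M'
    if x >= 1000000:
        return '{0:.0f}M'.format(x / 1000000)
    if x >= 1000:
        return '{0:.0f}K'.format(x / 1000)
    return '{0:.0f}'.format(x)
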
Example #2
def main():
    timer = Timer()
    timer.start()

    cc4 = ColorPalette.CC4
    blue = cc4[0]

    app_name = 'cyberbullying'
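    # rho: overall sampling rate of this dataset's stream (empirical value)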
    rho = 0.5272
    entity = 'user'

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))

    print('for entity: {0}'.format(entity))
    sample_entity_freq_dict = defaultdict(int)
    with open('../data/{1}_out/{0}_{1}_all.txt'.format(entity, app_name), 'r') as sample_datefile:
        for line in sample_datefile:
            sample_entity_freq_dict[line.rstrip().split(',')[1]] += 1

    complete_entity_freq_dict = defaultdict(int)
    with open('../data/{1}_out/complete_{0}_{1}.txt'.format(entity, app_name), 'r') as complete_datefile:
        for line in complete_datefile:
            complete_entity_freq_dict[line.rstrip().split(',')[1]] += 1

    complete_to_sample_freq_dict = defaultdict(list)
    sample_to_complete_freq_dict = defaultdict(list)

    for item, complete_vol in complete_entity_freq_dict.items():
        if item in sample_entity_freq_dict:
            complete_to_sample_freq_dict[complete_vol].append(sample_entity_freq_dict[item])
        else:
            complete_to_sample_freq_dict[complete_vol].append(0)

    for item, sample_vol in sample_entity_freq_dict.items():
        sample_to_complete_freq_dict[sample_vol].append(complete_entity_freq_dict[item])

    for item in set(complete_entity_freq_dict.keys()) - set(sample_entity_freq_dict.keys()):
        sample_to_complete_freq_dict[0].append(complete_entity_freq_dict[item])

    ax1_x_axis = range(1, 101)

    ax1_y_axis = []
    empirical_mean_list = []
    expected_mean_list = []
    for num_sample in ax1_x_axis:
        # compute sample to complete
        empirical_cnt_dist = sample_to_complete_freq_dict[num_sample]
        neg_binomial_cnt_dist = []
        for x in range(num_sample, max(30, 3 * num_sample + 1)):
            neg_binomial_cnt_dist.extend([x] * int(negative_binomial(x, num_sample, rho) * len(empirical_cnt_dist)))
        ks_test = stats.ks_2samp(empirical_cnt_dist, neg_binomial_cnt_dist)
        empirical_mean = sum(empirical_cnt_dist) / len(empirical_cnt_dist)
        empirical_mean_list.append(empirical_mean)
        expected_mean = sum(neg_binomial_cnt_dist) / len(neg_binomial_cnt_dist)
        expected_mean_list.append(expected_mean)
        print('num_sample: {0}, number of Bernoulli trials: {1}, d_statistic: {2:.4f}, p: {3:.4f}, expected mean: {4:.2f}, empirical mean: {5:.2f}'
              .format(num_sample, len(empirical_cnt_dist), ks_test[0], ks_test[1], expected_mean, empirical_mean))
        ax1_y_axis.append(ks_test[0])

    axes[0].plot(ax1_x_axis, ax1_y_axis, c='k', lw=1.5, ls='-')

    axes[0].set_xlabel(r'sample frequency $n_s$', fontsize=16)
    axes[0].set_ylabel('D-statistic', fontsize=16)
    axes[0].set_xlim([-2, 102])
    axes[0].set_xticks([0, 25, 50, 75, 100])
    axes[0].set_ylim([0, 0.17])
    axes[0].yaxis.set_major_formatter(FuncFormatter(lambda x, _: '{0:.2f}'.format(x)))
    axes[0].tick_params(axis='both', which='major', labelsize=16)
    axes[0].set_title('(a)', fontsize=18, pad=-3*72, y=1.0001)

    # show an example
    num_sample = np.argmin(ax1_y_axis) + 1

    axes[0].scatter(num_sample, ax1_y_axis[num_sample - 1], s=40, c=blue, zorder=30)
    axes[0].set_yticks([0, ax1_y_axis[num_sample - 1], 0.05, 0.1, 0.15])
    axes[0].plot([axes[0].get_xlim()[0], num_sample], [ax1_y_axis[num_sample - 1], ax1_y_axis[num_sample - 1]], color=blue, ls='--', lw=1)
    axes[0].plot([num_sample, num_sample], [axes[0].get_ylim()[0], ax1_y_axis[num_sample - 1]], color=blue, ls='--', lw=1)

    # plot sample to complete
    ax2_x_axis = range(num_sample, max(30, 3 * num_sample + 1))
    num_items = len(sample_to_complete_freq_dict[num_sample])
    sample_to_complete_cnt = Counter(sample_to_complete_freq_dict[num_sample])
    ax2_y_axis = [sample_to_complete_cnt[x] / num_items for x in ax2_x_axis]
    ax2_neg_binomial_axis = [negative_binomial(x, num_sample, rho) for x in ax2_x_axis]

    axes[1].plot(ax2_x_axis, ax2_y_axis, c=blue, lw=1.5, ls='-', marker='o', zorder=20, label='empirical')
    axes[1].plot(ax2_x_axis, ax2_neg_binomial_axis, c='k', lw=1.5, ls='-', marker='x', zorder=10, label='negative binomial')

    axes[1].set_xlabel(r'complete frequency $n_c$', fontsize=16)
    axes[1].set_ylabel(r'Pr($n_c$|$n_s$={0})'.format(num_sample), fontsize=16)
    axes[1].set_xticks([num_sample, 2 * num_sample, 3 * num_sample])
    axes[1].set_ylim([-0.005, 0.15])
    axes[1].set_yticks([0, 0.05, 0.1])
    axes[1].tick_params(axis='both', which='major', labelsize=16)
    axes[1].legend(frameon=False, fontsize=16, ncol=1, fancybox=False, shadow=True, loc='upper left')
    axes[1].set_title('(b)', fontsize=18, pad=-3*72, y=1.0001)

    axes[1].plot([empirical_mean_list[num_sample - 1], empirical_mean_list[num_sample - 1]], [axes[1].get_ylim()[0], 0.1], color=blue, ls='--', lw=1)
    axes[1].plot([expected_mean_list[num_sample - 1], expected_mean_list[num_sample - 1]], [axes[1].get_ylim()[0], 0.1], color='k', ls='--', lw=1)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/entity_negative_binomial.pdf', bbox_inches='tight')
    if platform.system() != 'Linux':
        plt.show()
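
Example #2 assumes a negative_binomial helper: if each post survives sampling independently with probability rho, the complete frequency x of an item with sample frequency n follows a negative binomial (the n-th sampled post is the x-th posted one). A minimal sketch under that assumption:

from scipy.special import comb

def negative_binomial(x, n, rho):
    # Pr(complete frequency = x | sample frequency = n) under Bernoulli
    # sampling at rate rho, defined for x >= n
    return comb(x - 1, n - 1) * (rho ** n) * ((1 - rho) ** (x - n))
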
Example #3
def main():
    timer = Timer()
    timer.start()

    app_name = 'cyberbullying'

    sample_cascade_size = {}
    sample_inter_arrival_time = []
    sample_cascade_influence = {}
    sample_cascade_influence_10m = defaultdict(int)
    sample_cascade_influence_1h = defaultdict(int)
    with open('../data/{0}_out/sample_retweet_{0}.txt'.format(app_name),
              'r') as fin:
        for line in fin:
            root_tweet, cascades = line.rstrip().split(':')
            cascades = cascades.split(',')
            root_tweet = root_tweet.split('-')[0]
            retweets = [x.split('-')[0] for x in cascades]
            influences = [int(x.split('-')[1]) for x in cascades]
            sample_cascade_size[root_tweet] = len(retweets)
            sample_cascade_influence[root_tweet] = sum(influences)
            root_timestamp = melt_snowflake(root_tweet)[0] / 1000
            retweet_timestamp_list = [root_timestamp]

            for i in range(len(retweets)):
                retweet_time = melt_snowflake(retweets[i])[0] / 1000
                relative_retweet_time = retweet_time - root_timestamp
                retweet_timestamp_list.append(retweet_time)
                if relative_retweet_time < 10 * 60:
                    sample_cascade_influence_10m[root_tweet] += influences[i]
                if relative_retweet_time < 60 * 60:
                    sample_cascade_influence_1h[root_tweet] += influences[i]

            for i in range(len(retweet_timestamp_list) - 1):
                sample_inter_arrival_time.append(
                    retweet_timestamp_list[i + 1] - retweet_timestamp_list[i])

    complete_cascade_size = {}
    complete_inter_arrival_time = []
    complete_cascade_influence = {}
    complete_cascade_influence_10m = defaultdict(int)
    complete_cascade_influence_1h = defaultdict(int)
    with open('../data/{0}_out/complete_retweet_{0}.txt'.format(app_name),
              'r') as fin:
        for line in fin:
            root_tweet, cascades = line.rstrip().split(':')
            cascades = cascades.split(',')
            root_tweet = root_tweet.split('-')[0]
            retweets = [x.split('-')[0] for x in cascades]
            complete_cascade_size[root_tweet] = len(retweets)
            if len(retweets) >= 50:
                influences = [int(x.split('-')[1]) for x in cascades]
                complete_cascade_influence[root_tweet] = sum(influences)
                root_timestamp = melt_snowflake(root_tweet)[0] / 1000
                retweet_timestamp_list = [root_timestamp]

                for i in range(len(retweets)):
                    retweet_time = melt_snowflake(retweets[i])[0] / 1000
                    relative_retweet_time = retweet_time - root_timestamp
                    retweet_timestamp_list.append(retweet_time)
                    if relative_retweet_time < 10 * 60:
                        complete_cascade_influence_10m[root_tweet] += influences[i]
                    if relative_retweet_time < 60 * 60:
                        complete_cascade_influence_1h[root_tweet] += influences[i]

                for i in range(len(retweet_timestamp_list) - 1):
                    complete_inter_arrival_time.append(
                        retweet_timestamp_list[i + 1] -
                        retweet_timestamp_list[i])

    print('number of cascades in the complete set', len(complete_cascade_size))
    print('number of cascades in the sample set', len(sample_cascade_size))

    print('mean complete size', np.mean(list(complete_cascade_size.values())))
    print('mean sample size', np.mean(list(sample_cascade_size.values())))

    print('complete #cascades (≥50 retweets)',
          sum([1 for x in list(complete_cascade_size.values()) if x >= 50]))
    print('sample #cascades (≥50 retweets)',
          sum([1 for x in list(sample_cascade_size.values()) if x >= 50]))

    num_complete_cascades_in_sample = 0
    complete_cascades_in_sample_size_list = []
    num_complete_cascades_in_sample_50 = 0
    for root_tweet in sample_cascade_size:
        if sample_cascade_size[root_tweet] == complete_cascade_size[root_tweet]:
            num_complete_cascades_in_sample += 1
            complete_cascades_in_sample_size_list.append(
                complete_cascade_size[root_tweet])
            if complete_cascade_size[root_tweet] >= 50:
                num_complete_cascades_in_sample_50 += 1
    print('number of complete cascades in the sample set',
          num_complete_cascades_in_sample)
    print('number of complete cascades (≥50 retweets) in the sample set',
          num_complete_cascades_in_sample_50)
    print('max: {0}, mean: {1}'.format(
        max(complete_cascades_in_sample_size_list),
        np.mean(complete_cascades_in_sample_size_list)))

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))

    cc4 = ColorPalette.CC4
    blue = cc4[0]
    red = cc4[3]

    sample_median = np.median(sample_inter_arrival_time)
    complete_median = np.median(complete_inter_arrival_time)

    plot_ccdf(sample_inter_arrival_time,
              ax=axes[0],
              color=blue,
              ls='-',
              label='sample')
    plot_ccdf(complete_inter_arrival_time,
              ax=axes[0],
              color='k',
              ls='-',
              label='complete')

    axes[0].plot([sample_median, sample_median], [0, 1],
                 color=blue,
                 ls='--',
                 lw=1)
    axes[0].plot([complete_median, complete_median], [0, 1],
                 color='k',
                 ls='--',
                 lw=1)

    print('\ninter_arrival_time sample median', sample_median)
    print('inter_arrival_time complete median', complete_median)

    axes[0].set_xscale('symlog')
    axes[0].set_xticks([0, 1, 100, 10000, 1000000])
    axes[0].set_yscale('linear')
    axes[0].set_xlabel('inter-arrival time (sec)', fontsize=16)
    axes[0].set_ylabel(r'$P(X \geq x)$', fontsize=16)
    axes[0].legend(frameon=False,
                   fontsize=16,
                   ncol=1,
                   fancybox=False,
                   shadow=True,
                   loc='upper right')
    axes[0].tick_params(axis='both', which='major', labelsize=16)
    axes[0].set_title('(a)', fontsize=18, pad=-3 * 72, y=1.0001)

    influence_list = []
    influence_list_10m = []
    influence_list_1h = []
    for root_tweet in sample_cascade_size:
        if complete_cascade_size[root_tweet] >= 50:
            if complete_cascade_influence[root_tweet] > 0:
                influence_list.append(sample_cascade_influence[root_tweet] /
                                      complete_cascade_influence[root_tweet])
            if complete_cascade_influence_10m[root_tweet] > 0:
                influence_list_10m.append(
                    sample_cascade_influence_10m[root_tweet] /
                    complete_cascade_influence_10m[root_tweet])
            if complete_cascade_influence_1h[root_tweet] > 0:
                influence_list_1h.append(
                    sample_cascade_influence_1h[root_tweet] /
                    complete_cascade_influence_1h[root_tweet])

    plot_ccdf(influence_list_10m, ax=axes[1], color=red, ls='-', label='10m')
    plot_ccdf(influence_list_1h, ax=axes[1], color=blue, ls='-', label='1h')
    plot_ccdf(influence_list, ax=axes[1], color='k', ls='-', label='14d')

    print('influence_list median', np.median(influence_list))
    print('influence_list_1h median', np.median(influence_list_1h))
    print('influence_list_10m median', np.median(influence_list_10m))

    print('influence_list 0.25', percentileofscore(influence_list, 0.25))
    print('influence_list_1h 0.25', percentileofscore(influence_list_1h, 0.25))
    print('influence_list_10m 0.25', percentileofscore(influence_list_10m, 0.25))

    print('influence_list 0.75', percentileofscore(influence_list, 0.75))
    print('influence_list_1h 0.75', percentileofscore(influence_list_1h, 0.75))
    print('influence_list_10m 0.75', percentileofscore(influence_list_10m, 0.75))

    axes[1].set_xscale('linear')
    axes[1].set_yscale('linear')
    axes[1].set_xlabel('relative potential reach', fontsize=16)
    # axes[1].set_ylabel(r'$P(X \geq x)$', fontsize=16)
    axes[1].legend(frameon=False,
                   fontsize=16,
                   ncol=1,
                   fancybox=False,
                   shadow=True,
                   loc='upper right')
    axes[1].tick_params(axis='both', which='major', labelsize=16)
    axes[1].set_title('(b)', fontsize=18, pad=-3 * 72, y=1.0001)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/cascades_measures.pdf', bbox_inches='tight')
    if platform.system() != 'Linux':
        plt.show()
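
Examples #3 and #7 recover timestamps from tweet IDs via melt_snowflake; Twitter snowflake IDs pack a millisecond timestamp into the high bits, so a minimal sketch consistent with melt_snowflake(tid)[0] returning milliseconds is:

def melt_snowflake(snowflake_id):
    # snowflake layout: 41-bit ms timestamp | 10-bit machine id | 12-bit sequence
    sf = int(snowflake_id)
    timestamp_ms = (sf >> 22) + 1288834974657  # offset is the Twitter epoch
    return timestamp_ms, (sf >> 12) & 0x3ff, sf & 0xfff
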
Example #4
def main():
    timer = Timer()
    timer.start()

    cc4 = ColorPalette.CC4
    blue = cc4[0]

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))

    timestamp_list = []
    sec_count_dict = defaultdict(int)
    ms_list = []

    with open('rate_limit_2015-09-08.txt', 'r') as fin:
        for line in fin:
            rate_json = json.loads(line.rstrip())
            ms_list.append(int(rate_json['limit']['timestamp_ms'][-3:]))
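            # rate limit messages cluster late in each second (panel (a) shows
            # most arrive after ms 700), so shift by 666 ms before truncating
            # to whole seconds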
            timestamp = datetime.utcfromtimestamp(
                (int(rate_json['limit']['timestamp_ms']) - 666) // 1000)
            timestamp_list.append(timestamp)
            sec_count_dict[timestamp] += 1

    print('{0:.2f}% rate limit messages come from millisecond 700 to 1000'.
          format(len([x for x in ms_list if x >= 700]) / len(ms_list) * 100))

    sns.distplot(ms_list,
                 bins=200,
                 color=blue,
                 ax=axes[0],
                 kde_kws={
                     'shade': False,
                     'linewidth': 1.5,
                     'color': 'k'
                 })
    axes[0].set_xticks([0, 250, 500, 750, 1000])
    axes[0].set_xlim([-50, 1050])
    axes[0].set_xlabel('millisecond', fontsize=16)
    axes[0].set_ylabel('density', fontsize=16)
    axes[0].tick_params(axis='both', which='major', labelsize=16)
    axes[0].set_title('(a)', size=18, pad=-3 * 72, y=1.0001)

    sec_count_stats = Counter(sec_count_dict.values())
    x_axis = sorted(sec_count_stats.keys())
    axes[1].bar(x_axis, [sec_count_stats[x] for x in x_axis],
                facecolor=blue,
                edgecolor='k',
                width=0.7)
    axes[1].set_xticks([1, 2, 3, 4])
    axes[1].set_xlim([0, 5])
    axes[1].set_xlabel('#rate limit messages per second', fontsize=16)
    axes[1].set_ylabel('frequency', fontsize=16)
    axes[1].yaxis.set_major_formatter(FuncFormatter(concise_fmt))
    axes[1].tick_params(axis='both', which='major', labelsize=16)
    axes[1].set_title('(b)', size=18, pad=-3 * 72, y=1.0001)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/SI_ratemsg_dist.pdf', bbox_inches='tight')
    if platform.system() != 'Linux':
        plt.show()
Example #5
def main():
    timer = Timer()
    timer.start()

    app_name = 'cyberbullying'
    archive_dir = '../data/{0}_out'.format(app_name)
    entities = ['user', 'hashtag']
    rho = 0.5272

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))

    cc4 = ColorPalette.CC4
    blue = cc4[0]

    for ax_idx, entity in enumerate(entities):
        sample_datefile = open(os.path.join(
            archive_dir, '{0}_{1}_all.txt'.format(entity, app_name)),
                               'r',
                               encoding='utf-8')
        complete_datefile = open(os.path.join(
            archive_dir, 'complete_{0}_{1}.txt'.format(entity, app_name)),
                                 'r',
                                 encoding='utf-8')

        sample_entity_freq_dict = defaultdict(int)
        complete_entity_freq_dict = defaultdict(int)
        uni_random_entity_freq_dict = defaultdict(int)

        if entity == 'user':
            for line in sample_datefile:
                sample_entity_freq_dict[line.rstrip().split(',')[1]] += 1
            for line in complete_datefile:
                complete_entity_freq_dict[line.rstrip().split(',')[1]] += 1
                toss = np.random.random_sample()
                if toss <= rho:
                    uni_random_entity_freq_dict[line.rstrip().split(',')[1]] += 1
        else:
            for line in sample_datefile:
                for item in line.rstrip().split(',')[1:]:
                    sample_entity_freq_dict[item.lower()] += 1
            for line in complete_datefile:
                for item in line.rstrip().split(',')[1:]:
                    complete_entity_freq_dict[item.lower()] += 1
                toss = np.random.random_sample()
                if toss <= rho:
                    for item in line.rstrip().split(',')[1:]:
                        uni_random_entity_freq_dict[item.lower()] += 1

        sample_datefile.close()
        complete_datefile.close()

        # compute the powerlaw fit in the complete set
        complete_freq_list = list(complete_entity_freq_dict.values())
        complete_powerlaw_fit = Fit(complete_freq_list)
        complete_alpha = complete_powerlaw_fit.power_law.alpha
        complete_xmin = complete_powerlaw_fit.power_law.xmin
        print('{0} complete set alpha {1}, xmin {2}'.format(
            entity, complete_alpha, complete_xmin))
        plot_ccdf(complete_freq_list,
                  ax=axes[ax_idx],
                  color='k',
                  ls='-',
                  label='complete')

        # compute the powerlaw fit in the sample set
        # infer the number of missing entities
        sample_freq_list = list(sample_entity_freq_dict.values())
        sample_freq_counter = Counter(sample_freq_list)

        # we observe the frequency of entities appearing at most 100 times
        num_interest = 100
        sample_freq_list_top100 = [0] * num_interest
        for freq in range(1, num_interest + 1):
            sample_freq_list_top100[freq - 1] = sample_freq_counter[freq]

        inferred_num_missing = infer_missing_num(sample_freq_list_top100,
                                                 rho=rho,
                                                 m=num_interest)
        corrected_sample_freq_list = sample_freq_list + [0] * inferred_num_missing
        sample_powerlaw_fit = Fit(corrected_sample_freq_list)
        sample_alpha = sample_powerlaw_fit.power_law.alpha
        sample_xmin = sample_powerlaw_fit.power_law.xmin
        print('{0} sample set alpha {1}, xmin {2}'.format(
            entity, sample_alpha, sample_xmin))
        plot_ccdf(corrected_sample_freq_list,
                  ax=axes[ax_idx],
                  color=blue,
                  ls='-',
                  label='sample')

        # compute the powerlaw fit in uniform random sample
        uni_random_num_missing = len(complete_entity_freq_dict) - len(
            uni_random_entity_freq_dict)
        uni_random_freq_list = list(uni_random_entity_freq_dict.values())
        uni_random_freq_list = uni_random_freq_list + [0] * uni_random_num_missing
        uni_random_powerlaw_fit = Fit(uni_random_freq_list)
        uni_random_alpha = uni_random_powerlaw_fit.power_law.alpha
        uni_random_xmin = uni_random_powerlaw_fit.power_law.xmin
        print('{0} uniform random sampling alpha {1}, xmin {2}'.format(
            entity, uni_random_alpha, uni_random_xmin))
        plot_ccdf(uni_random_freq_list,
                  ax=axes[ax_idx],
                  color='k',
                  ls='--',
                  label='uniform random')

        print('inferred missing', inferred_num_missing)
        print('empirical missing',
              len(complete_entity_freq_dict) - len(sample_entity_freq_dict))
        print('uniform random missing', uni_random_num_missing)

        print('KS test (sample, uniform)')
        print(stats.ks_2samp(corrected_sample_freq_list, uni_random_freq_list))

        print('KS test (sample, complete)')
        print(stats.ks_2samp(corrected_sample_freq_list, complete_freq_list))

        print('KS test (uniform, complete)')
        print(stats.ks_2samp(uni_random_freq_list, complete_freq_list))

        axes[ax_idx].set_xscale('symlog')
        axes[ax_idx].set_yscale('log')
        axes[ax_idx].set_xlabel('frequency', fontsize=16)
        axes[ax_idx].tick_params(axis='both', which='major', labelsize=16)

    axes[0].set_xticks([0, 1, 100, 10000])
    axes[0].set_yticks([1, 0.01, 0.0001, 0.000001])
    axes[0].set_ylabel(r'$P(X \geq x)$', fontsize=16)
    axes[0].legend(frameon=False,
                   fontsize=16,
                   ncol=1,
                   fancybox=False,
                   shadow=True,
                   loc='lower left')
    axes[0].set_title('(a) user posting', fontsize=18, pad=-3 * 72, y=1.0001)

    axes[1].set_xticks([0, 1, 100, 10000, 1000000])
    axes[1].set_yticks([1, 0.1, 0.001, 0.00001])
    axes[1].set_title('(b) hashtag', fontsize=18, pad=-3 * 72, y=1.0001)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/entity_freq_dist.pdf', bbox_inches='tight')
    if platform.system() != 'Linux':
        plt.show()
Example #6
def main():
    timer = Timer()
    timer.start()

    app_name = 'youtube'
    archive_dir = './{0}_out'.format(app_name)
    lang_list = ['ja+ko', 'others']

    cc4 = ColorPalette.CC4
    red = cc4[3]

    num_days = 14
    hours_in_day = 24

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))

    sample_tid_set = set()
    sample_ts_datefile = os.path.join(archive_dir,
                                      'ts_{0}_all.txt'.format(app_name))
    with open(sample_ts_datefile, 'r') as fin:
        for line in fin:
            split_line = line.rstrip().split(',')
            if len(split_line) == 2:
                ts, tid = split_line
                sample_tid_set.add(tid)

    for idx, lang in enumerate(lang_list):
        if idx == 0:
            subcrawler_ts_datefiles = [
                os.path.join(archive_dir, 'ts_{0}_{1}.txt'.format(app_name, j))
                for j in [2, 3, 8, 9]
            ]
        else:
            subcrawler_ts_datefiles = [
                os.path.join(archive_dir, 'ts_{0}_{1}.txt'.format(app_name, j))
                for j in [1, 4, 5, 6, 7, 10, 11, 12]
            ]

        num_in = 0
        num_out = 0

        count_sample = np.zeros(shape=(hours_in_day, num_days))
        count_complete = np.zeros(shape=(hours_in_day, num_days))

        visited_tid_set = set()

        for ts_datefile in subcrawler_ts_datefiles:
            with open(ts_datefile, 'r') as fin:
                for line in fin:
                    split_line = line.rstrip().split(',')
                    if len(split_line) == 2:
                        ts, tid = split_line
                        if tid not in visited_tid_set:
                            dt_obj = datetime.utcfromtimestamp(int(ts[:-3]))
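                            # day offset: the youtube crawl starts on
                            # 2019-11-06 (see Example #8), so day 6 -> index 0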
                            day_idx = dt_obj.day - 6
                            hour = dt_obj.hour
                            count_complete[hour][day_idx] += 1
                            if tid in sample_tid_set:
                                num_in += 1
                                count_sample[hour][day_idx] += 1
                            else:
                                num_out += 1
                            visited_tid_set.add(tid)

        print(
            'collected tweets: {0}, missing tweets: {1}, sample ratio for lang {2}: {3:.2f}%'
            .format(num_in, num_out, lang, num_in / (num_in + num_out) * 100))

        # hourly tweet volume in youtube for some languages
        sample_volume_mean_list_hour = []
        sample_ub_volume_mean_list_hour = []
        sample_lb_volume_mean_list_hour = []

        complete_volume_mean_list_hour = []
        complete_ub_volume_mean_list_hour = []
        complete_lb_volume_mean_list_hour = []

        for j in range(hours_in_day):
            mean, lb, ub = mean_confidence_interval(count_sample[j, :],
                                                    confidence=0.95)
            sample_volume_mean_list_hour.append(mean)
            sample_lb_volume_mean_list_hour.append(lb)
            sample_ub_volume_mean_list_hour.append(ub)

            mean, lb, ub = mean_confidence_interval(count_complete[j, :],
                                                    confidence=0.95)
            complete_volume_mean_list_hour.append(mean)
            complete_lb_volume_mean_list_hour.append(lb)
            complete_ub_volume_mean_list_hour.append(ub)

        print('tweet volumes from JST-6pm to 12am: {0:.2f}%'.format(
            100 * sum(complete_volume_mean_list_hour[9:15]) /
            sum(complete_volume_mean_list_hour)))
        print('sampling rates from JST-6pm to 12am: {0:.2f}%'.format(
            100 * sum(sample_volume_mean_list_hour[9:15]) /
            sum(complete_volume_mean_list_hour[9:15])))

        hour_x_axis = range(hours_in_day)
        axes[idx].plot(hour_x_axis,
                       complete_volume_mean_list_hour,
                       c='k',
                       lw=1.5,
                       ls='-',
                       zorder=20,
                       label='complete')
        axes[idx].fill_between(hour_x_axis,
                               complete_ub_volume_mean_list_hour,
                               complete_volume_mean_list_hour,
                               facecolor='lightgray',
                               lw=0,
                               zorder=10)
        axes[idx].fill_between(hour_x_axis,
                               complete_lb_volume_mean_list_hour,
                               complete_volume_mean_list_hour,
                               facecolor='lightgray',
                               lw=0,
                               zorder=10)

        axes[idx].plot(hour_x_axis,
                       sample_volume_mean_list_hour,
                       c='k',
                       lw=1.5,
                       ls='--',
                       zorder=20,
                       label='sample')
        axes[idx].fill_between(hour_x_axis,
                               sample_ub_volume_mean_list_hour,
                               sample_volume_mean_list_hour,
                               facecolor=red,
                               alpha=0.8,
                               lw=0,
                               zorder=10)
        axes[idx].fill_between(hour_x_axis,
                               sample_lb_volume_mean_list_hour,
                               sample_volume_mean_list_hour,
                               facecolor=red,
                               alpha=0.8,
                               lw=0,
                               zorder=10)

        if idx == 0:
            axes[idx].plot([9, 9],
                           [complete_ub_volume_mean_list_hour[9], 150000],
                           'k--',
                           lw=1)
            axes[idx].plot([15, 15],
                           [complete_ub_volume_mean_list_hour[15], 150000],
                           'k--',
                           lw=1)
            axes[idx].text(8,
                           150000,
                           'JST-6pm',
                           ha='center',
                           va='bottom',
                           size=16)
            axes[idx].text(16,
                           150000,
                           '12am',
                           ha='center',
                           va='bottom',
                           size=16)
            axes[idx].annotate('',
                               xy=(9, 135000),
                               xycoords='data',
                               xytext=(15, 135000),
                               textcoords='data',
                               arrowprops=dict(arrowstyle='<->',
                                               connectionstyle='arc3'),
                               zorder=50)

        axes[idx].set_xlabel('hour (in UTC)', fontsize=16)
        axes[idx].set_xticks([0, 6, 12, 18, 24])
        axes[idx].set_ylim([0, 180000])
        axes[idx].set_yticks([0, 50000, 100000, 150000])
        axes[idx].yaxis.set_major_formatter(FuncFormatter(concise_fmt))
        axes[idx].tick_params(axis='both', which='major', labelsize=16)
        axes[idx].set_title('({0}) {1}'.format(['a', 'b'][idx],
                                               lang_list[idx]),
                            size=18,
                            pad=-3 * 72,
                            y=1.0001)

    axes[0].set_ylabel('#tweets', fontsize=16)
    axes[1].legend(frameon=False,
                   fontsize=16,
                   ncol=1,
                   fancybox=False,
                   shadow=True,
                   loc='lower right')

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/tweet_lang_vol.pdf', bbox_inches='tight')
    if platform.system() != 'Linux':
        plt.show()
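
mean_confidence_interval (used in Examples #6 and #8) returns a mean together with a confidence band; a minimal Student-t sketch (the repository's exact method is an assumption):

import numpy as np
from scipy import stats

def mean_confidence_interval(data, confidence=0.95):
    # returns (mean, lower bound, upper bound) for the mean of `data`
    arr = np.asarray(data, dtype=float)
    mean = arr.mean()
    half_width = stats.sem(arr) * stats.t.ppf((1 + confidence) / 2, len(arr) - 1)
    return mean, mean - half_width, mean + half_width
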
Example #7
def main():
    timer = Timer()
    timer.start()

    app_name = 'cyberbullying'

    hours_in_day = 24
    minutes_in_hour = 60
    seconds_in_minute = 60
    ms_in_second = 1000

    num_bins = 100
    width = ms_in_second // num_bins

    num_top = 500

    confusion_sampling_rate = np.load('../data/{0}_out/{0}_confusion_sampling_rate.npy'.format(app_name))
    confusion_sampling_rate = np.nan_to_num(confusion_sampling_rate)

    load_external_data = False
    if not load_external_data:
        sample_entity_stats = defaultdict(int)
        with open('../data/{0}_out/user_{0}_all.txt'.format(app_name), 'r') as fin:
            for line in fin:
                split_line = line.rstrip().split(',')
                sample_entity_stats[split_line[1]] += 1

        # == == == == == == Part 1: Find top users in the sample set == == == == == == #
        print('>>> found top {0} users in sample set...'.format(num_top))
        sample_top = [kv[0] for kv in sorted(sample_entity_stats.items(), key=lambda x: x[1], reverse=True)[:num_top]]

        # == == == == == == Part 2: Find tweets appearing in complete set == == == == == == #
        complete_post_lists_hour = [[0] * hours_in_day for _ in range(num_top)]
        complete_post_lists_min = [[0] * minutes_in_hour for _ in range(num_top)]
        complete_post_lists_sec = [[0] * seconds_in_minute for _ in range(num_top)]
        complete_post_lists_10ms = [[0] * num_bins for _ in range(num_top)]

        complete_entity_stats = defaultdict(int)
        with open('../data/{0}_out/complete_user_{0}.txt'.format(app_name), 'r') as fin:
            for line in fin:
                split_line = line.rstrip().split(',')
                user_id = split_line[1]
                if user_id in sample_top:
                    complete_entity_stats[user_id] += 1

                    user_idx = sample_top.index(user_id)
                    tweet_id = split_line[0]
                    timestamp_ms = melt_snowflake(tweet_id)[0]
                    dt_obj = datetime.utcfromtimestamp(timestamp_ms // 1000)
                    hour = dt_obj.hour
                    minute = dt_obj.minute
                    second = dt_obj.second
                    millisec = timestamp_ms % 1000
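                    # 10 ms bins are shifted by 7 ms so the [657, 666] ms window
                    # where the sampling rate changes (see Example #8) falls in one bin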
                    ms_idx = (millisec-7) // width if millisec >= 7 else (1000 + millisec-7) // width

                    complete_post_lists_hour[user_idx][hour] += 1
                    complete_post_lists_min[user_idx][minute] += 1
                    complete_post_lists_sec[user_idx][second] += 1
                    complete_post_lists_10ms[user_idx][ms_idx] += 1

        write_to_file('./complete_post_lists_hour.txt', sample_top, complete_post_lists_hour)
        write_to_file('./complete_post_lists_min.txt', sample_top, complete_post_lists_min)
        write_to_file('./complete_post_lists_sec.txt', sample_top, complete_post_lists_sec)
        write_to_file('./complete_post_lists_10ms.txt', sample_top, complete_post_lists_10ms)

        print('>>> finish dumping complete lists...')
        timer.stop()

        # == == == == == == Part 3: Find appearing tweets in sample set == == == == == == #
        sample_post_lists_hour = [[0] * hours_in_day for _ in range(num_top)]
        sample_post_lists_min = [[0] * minutes_in_hour for _ in range(num_top)]
        sample_post_lists_sec = [[0] * seconds_in_minute for _ in range(num_top)]
        sample_post_lists_10ms = [[0] * num_bins for _ in range(num_top)]

        estimated_post_lists_hour = [[0] * hours_in_day for _ in range(num_top)]
        estimated_post_lists_min = [[0] * minutes_in_hour for _ in range(num_top)]
        estimated_post_lists_sec = [[0] * seconds_in_minute for _ in range(num_top)]
        estimated_post_lists_10ms = [[0] * num_bins for _ in range(num_top)]

        hourly_conversion = np.mean(confusion_sampling_rate, axis=(1, 2, 3))
        minutey_conversion = np.mean(confusion_sampling_rate, axis=(2, 3))
        secondly_conversion = np.mean(confusion_sampling_rate, axis=(3))

        with open('../data/{0}_out/user_{0}_all.txt'.format(app_name), 'r') as fin:
            for line in fin:
                split_line = line.rstrip().split(',')
                user_id = split_line[1]
                if user_id in sample_top:
                    user_idx = sample_top.index(user_id)
                    tweet_id = split_line[0]
                    timestamp_ms = melt_snowflake(tweet_id)[0]
                    dt_obj = datetime.utcfromtimestamp(timestamp_ms // 1000)
                    hour = dt_obj.hour
                    minute = dt_obj.minute
                    second = dt_obj.second
                    millisec = timestamp_ms % 1000
                    ms_idx = (millisec-7) // width if millisec >= 7 else (1000 + millisec-7) // width

                    sample_post_lists_hour[user_idx][hour] += 1
                    sample_post_lists_min[user_idx][minute] += 1
                    sample_post_lists_sec[user_idx][second] += 1
                    sample_post_lists_10ms[user_idx][ms_idx] += 1

                    estimated_post_lists_hour[user_idx][hour] += 1 / hourly_conversion[hour]
                    estimated_post_lists_min[user_idx][minute] += 1 / minutey_conversion[hour, minute]
                    estimated_post_lists_sec[user_idx][second] += 1 / secondly_conversion[hour, minute, second]
                    estimated_post_lists_10ms[user_idx][ms_idx] += 1 / confusion_sampling_rate[hour, minute, second, ms_idx]

        write_to_file('./sample_post_lists_hour.txt', sample_top, sample_post_lists_hour)
        write_to_file('./sample_post_lists_min.txt', sample_top, sample_post_lists_min)
        write_to_file('./sample_post_lists_sec.txt', sample_top, sample_post_lists_sec)
        write_to_file('./sample_post_lists_10ms.txt', sample_top, sample_post_lists_10ms)

        write_to_file('./estimated_post_lists_hour.txt', sample_top, estimated_post_lists_hour)
        write_to_file('./estimated_post_lists_min.txt', sample_top, estimated_post_lists_min)
        write_to_file('./estimated_post_lists_sec.txt', sample_top, estimated_post_lists_sec)
        write_to_file('./estimated_post_lists_10ms.txt', sample_top, estimated_post_lists_10ms)

        print('>>> finish dumping sample and estimated lists...')
        timer.stop()
    else:
        sample_top = []
        complete_post_lists_hour = []
        with open('./complete_post_lists_hour.txt', 'r') as fin:
            for line in fin:
                user_id, total, records = line.rstrip().split('\t')
                sample_top.append(user_id)
                records = list(map(int, records.split(',')))
                complete_post_lists_hour.append(records)

        complete_post_lists_min = read_from_file('./complete_post_lists_min.txt', dtype=0)
        complete_post_lists_sec = read_from_file('./complete_post_lists_sec.txt', dtype=0)
        complete_post_lists_10ms = read_from_file('./complete_post_lists_10ms.txt', dtype=0)

        sample_post_lists_hour = read_from_file('./sample_post_lists_hour.txt', dtype=0)
        sample_post_lists_min = read_from_file('./sample_post_lists_min.txt', dtype=0)
        sample_post_lists_sec = read_from_file('./sample_post_lists_sec.txt', dtype=0)
        sample_post_lists_10ms = read_from_file('./sample_post_lists_10ms.txt', dtype=0)

        estimated_post_lists_hour = read_from_file('./estimated_post_lists_hour.txt', dtype=1)
        estimated_post_lists_min = read_from_file('./estimated_post_lists_min.txt', dtype=1)
        estimated_post_lists_sec = read_from_file('./estimated_post_lists_sec.txt', dtype=1)
        estimated_post_lists_10ms = read_from_file('./estimated_post_lists_10ms.txt', dtype=1)

    # == == == == == == Part 4: Pick the best estimate for each user == == == == == == #
    ret = {}
    num_estimate_list = []
    num_sample_list = []
    num_complete_list = []

    sample_entity_stats = {user_id: sum(sample_post_lists_hour[user_idx]) for user_idx, user_id in enumerate(sample_top)}
    complete_entity_stats = {user_id: sum(complete_post_lists_hour[user_idx]) for user_idx, user_id in enumerate(sample_top)}

    min_mat = np.array([], dtype=np.int64).reshape(0, 60)
    sec_mat = np.array([], dtype=np.int64).reshape(0, 60)

    for user_idx, user_id in enumerate(sample_top):
        num_sample = sample_entity_stats[user_id]
        num_complete = complete_entity_stats[user_id]

        hour_entropy = entropy(sample_post_lists_hour[user_idx], base=hours_in_day)
        min_entropy = entropy(sample_post_lists_min[user_idx], base=minutes_in_hour)
        sec_entropy = entropy(sample_post_lists_sec[user_idx], base=seconds_in_minute)
        ms10_entropy = entropy(sample_post_lists_10ms[user_idx], base=num_bins)

        min_mat = np.vstack((min_mat, np.array(sample_post_lists_min[user_idx]).reshape(1, -1)))
        sec_mat = np.vstack((sec_mat, np.array(sample_post_lists_sec[user_idx]).reshape(1, -1)))

        # heuristic: low entropy at the 10 ms scale means posts concentrate in
        # a few millisecond bins, so use the millisecond-level estimate;
        # otherwise fall back to the hourly estimate
        if ms10_entropy < 0.87:
            min_entropy_idx = 3
        else:
            min_entropy_idx = 0

        num_estimate = sum([estimated_post_lists_hour[user_idx], estimated_post_lists_min[user_idx],
                            estimated_post_lists_sec[user_idx], estimated_post_lists_10ms[user_idx]][min_entropy_idx])
        num_estimate_list.append(num_estimate)

        num_sample_list.append(num_sample)
        num_complete_list.append(num_complete)

        ret[user_id] = (num_sample, num_complete, num_estimate, min_entropy_idx)

    # == == == == == == Part 5: Plot case users == == == == == == #
    case_user_ids = ['1033778124968865793', '1182605743335211009']
    case_user_screennames = ['WeltRadio', 'bensonbersk']

    fig, axes = plt.subplots(1, 2, figsize=(7.2, 2.3))

    cc4 = ColorPalette.CC4
    blue = cc4[0]
    red = cc4[3]
    filled_colors = [blue, red]
    labels = ['(c)', '(d)']
    for ax_idx, user_id in enumerate(case_user_ids):
        user_idx = sample_top.index(user_id)
        min_entropy_idx = ret[user_id][-1]

        if min_entropy_idx == 0:
            axes[ax_idx].bar(range(hours_in_day), complete_post_lists_hour[user_idx], color='lightgray', width=1)
            axes[ax_idx].bar(range(hours_in_day), sample_post_lists_hour[user_idx], color=filled_colors[ax_idx], alpha=0.8, width=1)
            axes[ax_idx].plot(range(hours_in_day), estimated_post_lists_hour[user_idx], 'k-', lw=1.5)
            axes[ax_idx].set_xlabel('hour', fontsize=12)
            axes[ax_idx].set_xlim([-1, hours_in_day+1])
            axes[ax_idx].set_xticks([0, 6, 12, 18, 24])
        elif min_entropy_idx == 1:
            axes[ax_idx].bar(range(minutes_in_hour), complete_post_lists_min[user_idx], color='lightgray', width=1)
            axes[ax_idx].bar(range(minutes_in_hour), sample_post_lists_min[user_idx], color=filled_colors[ax_idx], alpha=0.8, width=1)
            axes[ax_idx].plot(range(minutes_in_hour), estimated_post_lists_min[user_idx], 'k-', lw=1.5)
            axes[ax_idx].set_xlabel('minute', fontsize=12)
            axes[ax_idx].set_xlim([-1, minutes_in_hour+1])
            axes[ax_idx].set_xticks([0, 15, 30, 45, 60])
        elif min_entropy_idx == 2:
            axes[ax_idx].bar(range(seconds_in_minute), complete_post_lists_sec[user_idx], color='lightgray', width=1)
            axes[ax_idx].bar(range(seconds_in_minute), sample_post_lists_sec[user_idx], color=filled_colors[ax_idx], alpha=0.8, width=1)
            axes[ax_idx].plot(range(seconds_in_minute), estimated_post_lists_sec[user_idx], 'k-', lw=1.5)
            axes[ax_idx].set_xlabel('second', fontsize=12)
            axes[ax_idx].set_xlim([-1, seconds_in_minute+1])
            axes[ax_idx].set_xticks([0, 15, 30, 45, 60])
        elif min_entropy_idx == 3:
            axes[ax_idx].bar(range(num_bins), complete_post_lists_10ms[user_idx], color='lightgray', width=1)
            axes[ax_idx].bar(range(num_bins), sample_post_lists_10ms[user_idx], color=filled_colors[ax_idx], alpha=0.8, width=1)
            axes[ax_idx].plot(range(num_bins), estimated_post_lists_10ms[user_idx], 'k-', lw=1.5)
            axes[ax_idx].set_xlabel('millisecond', fontsize=12)
            axes[ax_idx].set_xlim([-3, num_bins+3])
            axes[ax_idx].set_xticks([0, 25, 50, 75, 100])
            axes[ax_idx].xaxis.set_major_formatter(
                FuncFormatter(lambda x, _: '{0:.0f}'.format(10 * x)))

        axes[ax_idx].tick_params(axis='both', which='major', labelsize=11)
        axes[ax_idx].set_title('{0} {1}'.format(labels[ax_idx], case_user_screennames[ax_idx]), fontsize=13)

    axes[0].set_ylabel('volume', fontsize=12)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/suspicious_users.pdf', bbox_inches='tight')
    if platform.system() != 'Linux':
        plt.show()
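
write_to_file and read_from_file in Example #7 cache the per-user lists; the loading branch above fixes the on-disk format (tab-separated user_id, total, and comma-joined counts), so a consistent sketch is:

def write_to_file(filepath, sample_top, post_lists):
    # one line per user: user_id \t total \t comma-separated per-bin counts
    with open(filepath, 'w') as fout:
        for user_id, records in zip(sample_top, post_lists):
            fout.write('{0}\t{1}\t{2}\n'.format(
                user_id, sum(records), ','.join(map(str, records))))

def read_from_file(filepath, dtype=0):
    # dtype=0 parses counts as int (sample/complete), dtype=1 as float (estimated)
    cast = int if dtype == 0 else float
    post_lists = []
    with open(filepath, 'r') as fin:
        for line in fin:
            _, _, records = line.rstrip().split('\t')
            post_lists.append(list(map(cast, records.split(','))))
    return post_lists
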
Example #8
def main():
    timer = Timer()
    timer.start()

    cc4 = ColorPalette.CC4
    blue = cc4[0]
    red = cc4[3]

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))

    num_days = 14
    hours_in_day = 24
    hour_x_axis = range(hours_in_day)
    minutes_in_hour = 60
    seconds_in_minute = 60
    ms_in_second = 1000
    ms_bins = 100
    width = ms_in_second // ms_bins
    ms_x_axis = range(ms_in_second)

    app_conf = {
        'cyberbullying': {
            'min_date': '2019-10-13',
            'label': 'Cyberbullying',
            'color': blue
        },
        'youtube': {
            'min_date': '2019-11-06',
            'label': 'YouTube',
            'color': red
        }
    }

    for app_name in app_conf.keys():
        archive_dir = './{0}_out'.format(app_name)

        min_date = datetime.strptime(app_conf[app_name]['min_date'],
                                     '%Y-%m-%d').replace(tzinfo=timezone.utc)
        min_timestamp = int(min_date.timestamp())
        min_day = min_date.day
        sample_datefile = open(
            os.path.join(archive_dir, 'ts_{0}_all.txt'.format(app_name)), 'r')
        complete_datefile = open(
            os.path.join(archive_dir, 'complete_ts_{0}.txt'.format(app_name)),
            'r')

        sample_tid_set = set()

        hour_hit_mat = np.zeros(shape=(hours_in_day, num_days))
        hour_miss_mat = np.zeros(shape=(hours_in_day, num_days))

        ms_hit_mat = np.zeros(shape=(ms_in_second, num_days * hours_in_day))
        ms_miss_mat = np.zeros(shape=(ms_in_second, num_days * hours_in_day))

        confusion_hit_mat = np.zeros(shape=(hours_in_day, minutes_in_hour,
                                            seconds_in_minute, ms_bins))
        confusion_miss_mat = np.zeros(shape=(hours_in_day, minutes_in_hour,
                                             seconds_in_minute, ms_bins))

        for line in sample_datefile:
            split_line = line.rstrip().split(',')
            if len(split_line) == 2:
                sample_tid_set.add(split_line[1])

        for line in complete_datefile:
            split_line = line.rstrip().split(',')
            if len(split_line) == 2:
                timestamp_sec = int(split_line[0][:-3])
                if timestamp_sec >= min_timestamp:
                    dt_obj = datetime.utcfromtimestamp(timestamp_sec)
                    day_idx = dt_obj.day - min_day
                    hour = dt_obj.hour
                    minute = dt_obj.minute
                    second = dt_obj.second
                    millisec = int(split_line[0][-3:])
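                    # same 7 ms-shifted 10 ms binning as Example #7, keeping the
                    # [657, 666] ms window within a single bin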
                    ms_idx = (millisec - 7) // width if millisec >= 7 else (
                        ms_in_second + millisec - 7) // width

                    if split_line[1] in sample_tid_set:
                        hour_hit_mat[hour][day_idx] += 1
                        ms_hit_mat[millisec][hours_in_day * day_idx + hour] += 1
                        confusion_hit_mat[hour][minute][second][ms_idx] += 1
                    else:
                        hour_miss_mat[hour][day_idx] += 1
                        ms_miss_mat[millisec][hours_in_day * day_idx + hour] += 1
                        confusion_miss_mat[hour][minute][second][ms_idx] += 1

        # hourly tweet sampling rate
        rho_mean_list_hour = []
        ub_rho_mean_list_hour = []
        lb_rho_mean_list_hour = []

        for i in hour_x_axis:
            mean, lb, ub = mean_confidence_interval(
                hour_hit_mat[i, :] /
                (hour_hit_mat[i, :] + hour_miss_mat[i, :]),
                confidence=0.95)
            rho_mean_list_hour.append(mean)
            lb_rho_mean_list_hour.append(lb)
            ub_rho_mean_list_hour.append(ub)

        # confusion sampling rate
        confusion_sampling_rate = confusion_hit_mat / (confusion_hit_mat +
                                                       confusion_miss_mat)
        confusion_sampling_rate = np.nan_to_num(confusion_sampling_rate)
        np.save(
            os.path.join(archive_dir,
                         '{0}_confusion_sampling_rate.npy'.format(app_name)),
            confusion_sampling_rate)

        axes[0].plot(hour_x_axis,
                     rho_mean_list_hour,
                     c='k',
                     lw=1.5,
                     ls='-',
                     zorder=20)
        axes[0].fill_between(hour_x_axis,
                             ub_rho_mean_list_hour,
                             rho_mean_list_hour,
                             facecolor=app_conf[app_name]['color'],
                             alpha=0.8,
                             lw=0,
                             zorder=10)
        axes[0].fill_between(hour_x_axis,
                             lb_rho_mean_list_hour,
                             rho_mean_list_hour,
                             facecolor=app_conf[app_name]['color'],
                             alpha=0.8,
                             lw=0,
                             zorder=10,
                             label='{0}'.format(app_conf[app_name]['label']))

        # msly tweet sampling rate
        rho_mean_list_ms = []
        ub_rho_mean_list_ms = []
        lb_rho_mean_list_ms = []

        for i in ms_x_axis:
            mean, lb, ub = mean_confidence_interval(
                ms_hit_mat[i, :] / (ms_hit_mat[i, :] + ms_miss_mat[i, :]),
                confidence=0.95)
            rho_mean_list_ms.append(mean)
            lb_rho_mean_list_ms.append(lb)
            ub_rho_mean_list_ms.append(ub)

        axes[1].plot(ms_x_axis,
                     rho_mean_list_ms,
                     c='k',
                     lw=1.5,
                     ls='-',
                     zorder=20)
        axes[1].fill_between(ms_x_axis,
                             ub_rho_mean_list_ms,
                             rho_mean_list_ms,
                             facecolor=app_conf[app_name]['color'],
                             alpha=0.8,
                             lw=0,
                             zorder=10)
        axes[1].fill_between(ms_x_axis,
                             lb_rho_mean_list_ms,
                             rho_mean_list_ms,
                             facecolor=app_conf[app_name]['color'],
                             alpha=0.8,
                             lw=0,
                             zorder=10)

    axes[0].set_xticks([0, 6, 12, 18, 24])
    axes[0].set_xlabel('hour (in UTC)', fontsize=16)
    axes[0].set_ylim([-0.05, 1.05])
    axes[0].set_yticks([0, 0.25, 0.5, 0.75, 1.0])
    axes[0].set_ylabel(r'sampling rate $\rho_t$', fontsize=16)
    axes[0].tick_params(axis='both', which='major', labelsize=16)
    axes[0].legend(frameon=False,
                   fontsize=16,
                   ncol=1,
                   fancybox=False,
                   shadow=True)
    axes[0].set_title('(a)', size=18, pad=-3 * 72, y=1.0001)

    axes[1].axvline(x=657, ymin=0, ymax=0.4, c='k', ls='--')
    axes[1].text(667, 0.2, 'x=657', size=18, ha='left', va='center')
    axes[1].set_xticks([0, 250, 500, 750, 1000])
    axes[1].set_xlabel('millisecond', fontsize=16)
    axes[1].set_ylim([-0.05, 1.05])
    axes[1].set_yticks([0, 0.25, 0.5, 0.75, 1.0])
    axes[1].tick_params(axis='both', which='major', labelsize=16)
    axes[1].set_title('(b)', size=18, pad=-3 * 72, y=1.0001)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/temporal_sampling_rates.pdf', bbox_inches='tight')
    if platform.system() != 'Linux':
        plt.show()