def main():
    timer = Timer()
    timer.start()

    app_name = 'cyberbullying'

    sample_cascade_size = {}
    sample_inter_arrival_time = []
    sample_cascade_influence = {}
    sample_cascade_influence_10m = defaultdict(int)
    sample_cascade_influence_1h = defaultdict(int)
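    # Each line of sample_retweet_*.txt is assumed (inferred from the parsing below) to be
    # 'root_tid-#followers:rt_tid1-#followers1,rt_tid2-#followers2,...', i.e. a root tweet
    # and its retweets, each paired with the posting user's follower count ("influence").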
    with open('../data/{0}_out/sample_retweet_{0}.txt'.format(app_name),
              'r') as fin:
        for line in fin:
            root_tweet, cascades = line.rstrip().split(':')
            cascades = cascades.split(',')
            root_tweet = root_tweet.split('-')[0]
            retweets = [x.split('-')[0] for x in cascades]
            influences = [int(x.split('-')[1]) for x in cascades]
            sample_cascade_size[root_tweet] = len(retweets)
            sample_cascade_influence[root_tweet] = sum(influences)
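            # melt_snowflake(tid)[0] is the millisecond timestamp embedded in the
            # Snowflake id; dividing by 1000 converts it to seconds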
            root_timestamp = melt_snowflake(root_tweet)[0] / 1000
            retweet_timestamp_list = [root_timestamp]

            for i in range(len(retweets)):
                retweet_time = melt_snowflake(retweets[i])[0] / 1000
                relative_retweet_time = retweet_time - root_timestamp
                retweet_timestamp_list.append(
                    melt_snowflake(retweets[i])[0] / 1000)
                if relative_retweet_time < 10 * 60:
                    sample_cascade_influence_10m[root_tweet] += influences[i]
                if relative_retweet_time < 60 * 60:
                    sample_cascade_influence_1h[root_tweet] += influences[i]

            for i in range(len(retweet_timestamp_list) - 1):
                sample_inter_arrival_time.append(retweet_timestamp_list[i +
                                                                        1] -
                                                 retweet_timestamp_list[i])

    complete_cascade_size = {}
    complete_inter_arrival_time = []
    complete_cascade_influence = {}
    complete_cascade_influence_10m = defaultdict(int)
    complete_cascade_influence_1h = defaultdict(int)
    with open('../data/{0}_out/complete_retweet_{0}.txt'.format(app_name),
              'r') as fin:
        for line in fin:
            root_tweet, cascades = line.rstrip().split(':')
            cascades = cascades.split(',')
            root_tweet = root_tweet.split('-')[0]
            retweets = [x.split('-')[0] for x in cascades]
            complete_cascade_size[root_tweet] = len(retweets)
            if len(retweets) >= 50:
                influences = [int(x.split('-')[1]) for x in cascades]
                complete_cascade_influence[root_tweet] = sum(influences)
                root_timestamp = melt_snowflake(root_tweet)[0] / 1000
                retweet_timestamp_list = [root_timestamp]

                for i in range(len(retweets)):
                    retweet_time = melt_snowflake(retweets[i])[0] / 1000
                    relative_retweet_time = retweet_time - root_timestamp
                    retweet_timestamp_list.append(
                        melt_snowflake(retweets[i])[0] / 1000)
                    if relative_retweet_time < 10 * 60:
                        complete_cascade_influence_10m[
                            root_tweet] += influences[i]
                    if relative_retweet_time < 60 * 60:
                        complete_cascade_influence_1h[
                            root_tweet] += influences[i]

                for i in range(len(retweet_timestamp_list) - 1):
                    complete_inter_arrival_time.append(
                        retweet_timestamp_list[i + 1] -
                        retweet_timestamp_list[i])

    print('number of cascades in the complete set', len(complete_cascade_size))
    print('number of cascades in the sample set', len(sample_cascade_size))

    print('mean complete size', np.mean(list(complete_cascade_size.values())))
    print('mean sample size', np.mean(list(sample_cascade_size.values())))

    print('complete #cascades (≥50 retweets)',
          sum([1 for x in list(complete_cascade_size.values()) if x >= 50]))
    print('sample #cascades (≥50 retweets)',
          sum([1 for x in list(sample_cascade_size.values()) if x >= 50]))

    num_complete_cascades_in_sample = 0
    complete_cascades_in_sample_size_list = []
    num_complete_cascades_in_sample_50 = 0
    for root_tweet in sample_cascade_size:
        if sample_cascade_size[root_tweet] == complete_cascade_size[
                root_tweet]:
            num_complete_cascades_in_sample += 1
            complete_cascades_in_sample_size_list.append(
                complete_cascade_size[root_tweet])
            if complete_cascade_size[root_tweet] >= 50:
                num_complete_cascades_in_sample_50 += 1
    print('number of complete cascades in the sample set',
          num_complete_cascades_in_sample)
    print('number of complete cascades (≥50 retweets) in the sample set',
          num_complete_cascades_in_sample_50)
    print('max: {0}, mean: {1}'.format(
        max(complete_cascades_in_sample_size_list),
        np.mean(complete_cascades_in_sample_size_list)))

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))

    cc4 = ColorPalette.CC4
    blue = cc4[0]
    red = cc4[3]

    sample_median = np.median(sample_inter_arrival_time)
    complete_median = np.median(complete_inter_arrival_time)

    plot_ccdf(sample_inter_arrival_time,
              ax=axes[0],
              color=blue,
              ls='-',
              label='sample')
    plot_ccdf(complete_inter_arrival_time,
              ax=axes[0],
              color='k',
              ls='-',
              label='complete')

    axes[0].plot([sample_median, sample_median], [0, 1],
                 color=blue,
                 ls='--',
                 lw=1)
    axes[0].plot([complete_median, complete_median], [0, 1],
                 color='k',
                 ls='--',
                 lw=1)

    print('\ninter_arrival_time sample median', sample_median)
    print('inter_arrival_time complete median', complete_median)

    axes[0].set_xscale('symlog')
    axes[0].set_xticks([0, 1, 100, 10000, 1000000])
    axes[0].set_yscale('linear')
    axes[0].set_xlabel('inter-arrival time (sec)', fontsize=16)
    axes[0].set_ylabel(r'$P(X \geq x)$', fontsize=16)
    axes[0].legend(frameon=False,
                   fontsize=16,
                   ncol=1,
                   fancybox=False,
                   shadow=True,
                   loc='upper right')
    axes[0].tick_params(axis='both', which='major', labelsize=16)
    axes[0].set_title('(a)', fontsize=18, pad=-3 * 72, y=1.0001)

    influence_list = []
    influence_list_10m = []
    influence_list_1h = []
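    # relative potential reach: followers reached by the sampled cascade divided by
    # followers reached by the complete cascade, within 10 minutes, 1 hour, and the
    # full 14-day collection window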
    for root_tweet in sample_cascade_size:
        if complete_cascade_size[root_tweet] >= 50:
            if complete_cascade_influence[root_tweet] > 0:
                influence_list.append(sample_cascade_influence[root_tweet] /
                                      complete_cascade_influence[root_tweet])
            if complete_cascade_influence_10m[root_tweet] > 0:
                influence_list_10m.append(
                    sample_cascade_influence_10m[root_tweet] /
                    complete_cascade_influence_10m[root_tweet])
            if complete_cascade_influence_1h[root_tweet] > 0:
                influence_list_1h.append(
                    sample_cascade_influence_1h[root_tweet] /
                    complete_cascade_influence_1h[root_tweet])

    plot_ccdf(influence_list_10m, ax=axes[1], color=red, ls='-', label='10m')
    plot_ccdf(influence_list_1h, ax=axes[1], color=blue, ls='-', label='1h')
    plot_ccdf(influence_list, ax=axes[1], color='k', ls='-', label='14d')

    print('influence_list median', np.median(influence_list))
    print('influence_list_1h median', np.median(influence_list_1h))
    print('influence_list_10m median', np.median(influence_list_10m))

    print('influence_list 0.25', percentileofscore(influence_list, 0.25))
    print('influence_list_1h 0.25', percentileofscore(influence_list_1h, 0.25))
    print('influence_list_10m 0.25', percentileofscore(influence_list_10m, 0.25))

    print('influence_list 0.75', percentileofscore(influence_list, 0.75))
    print('influence_list_1h 0.75', percentileofscore(influence_list_1h, 0.75))
    print('influence_list_10m 0.75', percentileofscore(influence_list_10m, 0.75))

    axes[1].set_xscale('linear')
    axes[1].set_yscale('linear')
    axes[1].set_xlabel('relative potential reach', fontsize=16)
    # axes[1].set_ylabel('$P(X \geq x)$', fontsize=16)
    axes[1].legend(frameon=False,
                   fontsize=16,
                   ncol=1,
                   fancybox=False,
                   shadow=True,
                   loc='upper right')
    axes[1].tick_params(axis='both', which='major', labelsize=16)
    axes[1].set_title('(b)', fontsize=18, pad=-3 * 72, y=1.0001)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/cascades_measures.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
def main():
    timer = Timer()
    timer.start()

    app_names = ['cyberbullying', 'youtube']
    # data for plot subfig (a)
    showcase_segment_idx = 0
    showcase_complete_tid_list = []
    showcase_retrieved_tid_list = []
    showcase_ratemsg_list = []
    showcase_track_list = []

    # data for plot subfig (b)
    mape_dict = {app_name: [] for app_name in app_names}

    rate_silence_length = 10000  # ms silenced around each rate limit message
    disconnect_silence_length = 180000  # ms silenced preceding each disconnect message
    print('>>> We silence {0} seconds around rate limit messages'.format(
        rate_silence_length // 1000))
    print('>>> We silence {0} seconds preceding disconnect messages\n'.format(
        disconnect_silence_length // 1000))

    for app_name in app_names:
        print('>>> Computing on app {0}'.format(app_name))
        archive_dir = './{0}_out'.format(app_name)

        sample_input_path = os.path.join(archive_dir,
                                         'ts_{0}_all.txt'.format(app_name))
        complete_input_path = os.path.join(
            archive_dir, 'complete_ts_{0}.txt'.format(app_name))

        # == == == == == == Part 1: Initially select segments in the complete set == == == == == == #
        # segments that silence 10s around rate limit messages and 180s preceding disconnect messages in the complete set
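        # each initial segment is a (start_ts, end_ts, duration) tuple in milliseconds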
        init_segment_list = []
        init_start_ts = 0
        with open(complete_input_path, 'r') as fin:
            for line in fin:
                split_line = line.rstrip().split(',')
                # if it is a disconnect msg
                if 'disconnect' in split_line[1]:
                    disconnect_ts = int(split_line[0])
                    # disconnect message, remove the preceding [disconnect_silence_length]
                    init_end_ts = disconnect_ts - disconnect_silence_length
                    if init_end_ts > init_start_ts:
                        init_segment_list.append((init_start_ts, init_end_ts,
                                                  init_end_ts - init_start_ts))
                    init_start_ts = disconnect_ts
                # elif it is a rate limit msg
                elif 'ratemsg' in split_line[1]:
                    ratemsg_ts = int(split_line[0])
                    # rate limit message, remove the surrounding [rate_silence_length]
                    init_end_ts = ratemsg_ts - rate_silence_length // 2
                    if init_end_ts > init_start_ts:
                        init_segment_list.append((init_start_ts, init_end_ts,
                                                  init_end_ts - init_start_ts))
                    init_start_ts = ratemsg_ts + rate_silence_length // 2
        print(
            '>>> Initially, we identify {0} segments in complete set without rate limit message'
            .format(len(init_segment_list)))
        # print(init_segment_list[: 10])

        # == == == == == == Part 2: Segments are bounded by 2 rate limit messages in the sample set == == == == == == #
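        # each bounded segment is (start_ts, end_ts, duration, estimated #missing from the
        # track counts); sample and complete tweet volumes are appended later in Part 3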
        bounded_segment_list = []
        current_segment_idx = 0
        current_start_ts = 0
        current_ratemsg_list = []
        current_track_list = []
        last_ratemsg_ts = 0

        look_for_end = False
        found_showcase = False

        with open(sample_input_path, 'r') as fin:
            for line in fin:
                split_line = line.rstrip().split(',')
                if 'ratemsg' in split_line[1]:
                    ratemsg_ts = int(split_line[0])
                    track = int(split_line[2])
                    if not look_for_end or (
                            look_for_end
                            and current_start_ts == last_ratemsg_ts
                            and init_segment_list[current_segment_idx][1] <
                            ratemsg_ts):
                        # fast forward, skip some really short segments
                        while ratemsg_ts >= init_segment_list[
                                current_segment_idx][1]:
                            current_segment_idx += 1
                        if current_segment_idx == len(init_segment_list):
                            break
                        if ratemsg_ts >= init_segment_list[
                                current_segment_idx][0]:
                            current_start_ts = ratemsg_ts
                            current_ratemsg_list = [ratemsg_ts]
                            current_track_list = [track]
                            look_for_end = True
                        else:
                            look_for_end = False
                    elif look_for_end:
                        if current_start_ts < last_ratemsg_ts <= init_segment_list[
                                current_segment_idx][1] < ratemsg_ts:
                            current_num_miss = count_track(
                                current_track_list,
                                start_with_rate=True,
                                subcrawler=False)
                            bounded_segment_list.append(
                                (current_start_ts, last_ratemsg_ts,
                                 last_ratemsg_ts - current_start_ts,
                                 current_num_miss))

                            # find the first example segment that is around 11 sec long
                            if app_name == 'cyberbullying' and not found_showcase and 10000 <= last_ratemsg_ts - current_start_ts <= 12000:
                                showcase_segment_idx = len(
                                    bounded_segment_list) - 1
                                showcase_ratemsg_list = copy.deepcopy(
                                    current_ratemsg_list)
                                showcase_track_list = copy.deepcopy(
                                    current_track_list)
                                found_showcase = True

                            current_segment_idx += 1
                            if current_segment_idx == len(init_segment_list):
                                break
                            if ratemsg_ts >= init_segment_list[
                                    current_segment_idx][0]:
                                current_start_ts = ratemsg_ts
                                current_ratemsg_list = [ratemsg_ts]
                                current_track_list = [track]
                                look_for_end = True
                            else:
                                look_for_end = False
                        else:
                            current_ratemsg_list.append(ratemsg_ts)
                            current_track_list.append(track)

                    last_ratemsg_ts = ratemsg_ts
                    if current_segment_idx == len(init_segment_list):
                        break
        print('>>> We further bound {0} segments with 2 rate limit messages'.
              format(len(bounded_segment_list)))
        # print(bounded_segment_list[-10:])

        # == == == == == == Part 3: Add sample and complete volume in each segment == == == == == == #
        for input_path, tid_list in zip(
            [sample_input_path, complete_input_path],
            [showcase_retrieved_tid_list, showcase_complete_tid_list]):
            current_segment_idx = 0
            current_segment_cnt = 0
            with open(input_path, 'r') as fin:
                for line in fin:
                    split_line = line.rstrip().split(',')
                    if len(split_line) == 2:
                        msg_ts = int(split_line[0])
                        if bounded_segment_list[current_segment_idx][
                                0] < msg_ts <= bounded_segment_list[
                                    current_segment_idx][1]:
                            current_segment_cnt += 1

                            if app_name == 'cyberbullying' and current_segment_idx == showcase_segment_idx:
                                tweet_id = split_line[1]
                                tid_list.append(tweet_id)
                        elif msg_ts > bounded_segment_list[
                                current_segment_idx][1]:
                            bounded_segment_list[current_segment_idx] = (
                                *bounded_segment_list[current_segment_idx],
                                current_segment_cnt)
                            current_segment_idx += 1
                            current_segment_cnt = 0
                            if current_segment_idx == len(
                                    bounded_segment_list):
                                break
            # print(bounded_segment_list[-10:])

        length_tracker = 0
        mape_list = []
        for segment in bounded_segment_list:
            # segment = (start_ts, end_ts, duration, est_missing, sample_cnt, complete_cnt)
            length_tracker += segment[2]
            mape_list.append(mape(segment[-1], segment[-2] + segment[-3]))
        mape_dict[app_name] = copy.deepcopy(mape_list)
        print('MAPE: {0:.5f} +- {1:.5f}, median: {2:.5f}'.format(
            np.mean(mape_list), np.std(mape_list), np.median(mape_list)))
        print('total tracked days bounded: {0:.2f} out of 14'.format(
            length_tracker / 1000 / 60 / 60 / 24))

        if app_name == 'cyberbullying':
            print(
                'complete tweets: {0}, retrieved tweets: {1}, estimated missing: {2}'
                .format(
                    len(showcase_complete_tid_list),
                    len(showcase_retrieved_tid_list),
                    count_track(showcase_track_list,
                                start_with_rate=True,
                                subcrawler=False)))
            print('ratemsg timestamp', showcase_ratemsg_list)
            print('ratemsg track', showcase_track_list)
        print()

    timer.stop()

    # == == == == == == Part 4: Plot a showcase segment that is roughly 10s == == == == == == #
    cc4 = ColorPalette.CC4
    blue = cc4[0]
    green = cc4[1]
    red = cc4[3]
    fig, axes = plt.subplots(1, 4, figsize=(12, 1.6))
    ax2 = axes[-1]
    gs = axes[1].get_gridspec()
    for ax in axes[:-1]:
        ax.remove()
    ax1 = fig.add_subplot(gs[:-1])

    # add a timeline
    ax1.axhline(0, linewidth=2, color='k')

    observed_tweet_ts_list = sorted(
        [melt_snowflake(tid)[0] for tid in showcase_retrieved_tid_list])
    showcase_missing_tid_set = set(showcase_complete_tid_list).difference(
        set(showcase_retrieved_tid_list))
    missing_tweet_ts_list = sorted(
        [melt_snowflake(tid)[0] for tid in showcase_missing_tid_set])
    ax1.scatter(observed_tweet_ts_list, [1] * len(observed_tweet_ts_list),
                marker='o',
                facecolors='none',
                edgecolors=blue,
                lw=1,
                s=20)
    ax1.scatter(missing_tweet_ts_list, [0.5] * len(missing_tweet_ts_list),
                marker='x',
                c='k',
                lw=1,
                s=20)
    # stats for missing tweets, cut by rate limit msg timestamp_ms
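    # for each interval between consecutive rate limit messages, count how many
    # complete-set tweets were not retrieved, to compare against the track-based estimate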
    complete_track_list = []
    i, j, curr_cnt = 0, 1, 0
    while i < len(missing_tweet_ts_list) and j < len(showcase_ratemsg_list):
        if missing_tweet_ts_list[i] <= showcase_ratemsg_list[j]:
            curr_cnt += 1
            i += 1
        else:
            complete_track_list.append(curr_cnt)
            curr_cnt = 0
            j += 1
    complete_track_list.append(curr_cnt)
    # print(complete_track_list)

    for idx, ts in enumerate(showcase_ratemsg_list):
        ax1.axvline(ts, ymin=0, ymax=1.1, linewidth=1, color='k')

    for idx, ts in enumerate(showcase_ratemsg_list[1:]):
        ax1.text(ts - 50,
                 0.42,
                 '/{0:>3}'.format(complete_track_list[idx]),
                 color='k',
                 ha='right',
                 va='top',
                 size=10)
        ax1.text(ts - 470,
                 0.42,
                 str(showcase_track_list[idx + 1] - showcase_track_list[idx]),
                 color=green,
                 ha='right',
                 va='top',
                 size=10)

    ax1.xaxis.set_major_formatter(FuncFormatter(to_datetime))
    ax1.set_xlim(left=showcase_ratemsg_list[0] - 200,
                 right=showcase_ratemsg_list[-1] + 200)
    ax1.set_yticks([0.5, 1.0])
    ax1.set_ylim(top=1.2, bottom=0)
    num_missing_by_counting = len(showcase_complete_tid_list) - len(
        showcase_retrieved_tid_list)
    num_missing_by_estimating = count_track(showcase_track_list,
                                            start_with_rate=True,
                                            subcrawler=False)
    num_observed_tweets = len(showcase_retrieved_tid_list)
    ax1.tick_params(axis='x', which='major', labelsize=10)
    ax1.tick_params(axis='y', which='both', length=0)
    ax1.set_yticklabels([
        'missing tweets\n{0}/{1}'.format(num_missing_by_estimating,
                                         num_missing_by_counting),
        'collected tweets\n{0}'.format(num_observed_tweets)
    ],
                        fontsize=10)

    # remove borders
    ax1.spines['right'].set_visible(False)
    ax1.spines['left'].set_visible(False)
    ax1.spines['top'].set_visible(False)
    ax1.spines['bottom'].set_visible(False)
    ax1.set_title('(a)', fontsize=11, pad=-1.35 * 72, y=1.0001)

    bplot = ax2.boxplot([mape_dict['cyberbullying'], mape_dict['youtube']],
                        labels=['Cyberbullying', 'YouTube'],
                        widths=0.5,
                        showfliers=False,
                        showmeans=False,
                        patch_artist=True)

    for patch, color in zip(bplot['boxes'], [blue, red]):
        patch.set_facecolor(color)

    for median in bplot['medians']:
        median.set(color='k', linewidth=1)

    ax2.tick_params(axis='both', which='major', labelsize=10)
    ax2.set_ylabel('MAPE', fontsize=10)
    ax2.spines['right'].set_visible(False)
    ax2.spines['top'].set_visible(False)
    ax2.set_title('(b)', fontsize=11, pad=-1.35 * 72, y=1.0001)

    plt.tight_layout(rect=[0, 0.03, 1, 1])
    plt.savefig('../images/validate_ratemsg.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
def main():
    timer = Timer()
    timer.start()

    app_name = 'cyberbullying'

    hours_in_day = 24
    minutes_in_hour = 60
    seconds_in_minute = 60
    ms_in_second = 1000

    num_bins = 100
    width = ms_in_second // num_bins

    num_top = 500

    fig, axes = plt.subplots(1,
                             2,
                             figsize=(7.2, 4.8),
                             gridspec_kw={'width_ratios': [2.75, 3]})
    axes = axes.ravel()

    confusion_sampling_rate = np.load(
        '../data/{0}_out/{0}_confusion_sampling_rate.npy'.format(app_name))
    confusion_sampling_rate = np.nan_to_num(confusion_sampling_rate)

    load_external_data = True
    if not load_external_data:
        sample_entity_stats = defaultdict(int)
        with open('../data/{0}_out/user_{0}_all.txt'.format(app_name),
                  'r') as fin:
            for line in fin:
                split_line = line.rstrip().split(',')
                sample_entity_stats[split_line[1]] += 1

        # == == == == == == Part 1: Find top users in the sample set == == == == == == #
        print('>>> found top {0} users in sample set...'.format(num_top))
        sample_top = [
            kv[0] for kv in sorted(sample_entity_stats.items(),
                                   key=lambda x: x[1],
                                   reverse=True)[:num_top]
        ]

        # == == == == == == Part 2: Count the top users' tweets in the complete set == == == == == == #
        complete_post_lists_hour = [[0] * hours_in_day for _ in range(num_top)]
        complete_post_lists_min = [[0] * minutes_in_hour
                                   for _ in range(num_top)]
        complete_post_lists_sec = [[0] * seconds_in_minute
                                   for _ in range(num_top)]
        complete_post_lists_10ms = [[0] * num_bins for _ in range(num_top)]

        complete_entity_stats = defaultdict(int)
        with open('../data/{0}_out/complete_user_{0}.txt'.format(app_name),
                  'r') as fin:
            for line in fin:
                split_line = line.rstrip().split(',')
                user_id = split_line[1]
                if user_id in sample_top:
                    complete_entity_stats[user_id] += 1

                    user_idx = sample_top.index(user_id)
                    tweet_id = split_line[0]
                    timestamp_ms = melt_snowflake(tweet_id)[0]
                    dt_obj = datetime.utcfromtimestamp(timestamp_ms // 1000)
                    hour = dt_obj.hour
                    minute = dt_obj.minute
                    second = dt_obj.second
                    millisec = timestamp_ms % 1000
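                    # bin the millisecond remainder into num_bins buckets of `width` ms,
                    # rotated by 7 ms (presumably to align bin edges with Twitter's
                    # millisecond sampling window); remainders below 7 wrap to the last bin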
                    ms_idx = (millisec - 7) // width if millisec >= 7 else (
                        1000 + millisec - 7) // width

                    complete_post_lists_hour[user_idx][hour] += 1
                    complete_post_lists_min[user_idx][minute] += 1
                    complete_post_lists_sec[user_idx][second] += 1
                    complete_post_lists_10ms[user_idx][ms_idx] += 1

        write_to_file('./complete_post_lists_hour.txt', sample_top,
                      complete_post_lists_hour)
        write_to_file('./complete_post_lists_min.txt', sample_top,
                      complete_post_lists_min)
        write_to_file('./complete_post_lists_sec.txt', sample_top,
                      complete_post_lists_sec)
        write_to_file('./complete_post_lists_10ms.txt', sample_top,
                      complete_post_lists_10ms)

        print('>>> finish dumping complete lists...')
        timer.stop()

        # == == == == == == Part 3: Count the top users' tweets in the sample set and estimate corrections == == == == == == #
        sample_post_lists_hour = [[0] * hours_in_day for _ in range(num_top)]
        sample_post_lists_min = [[0] * minutes_in_hour for _ in range(num_top)]
        sample_post_lists_sec = [[0] * seconds_in_minute
                                 for _ in range(num_top)]
        sample_post_lists_10ms = [[0] * num_bins for _ in range(num_top)]

        estimated_post_lists_hour = [[0] * hours_in_day
                                     for _ in range(num_top)]
        estimated_post_lists_min = [[0] * minutes_in_hour
                                    for _ in range(num_top)]
        estimated_post_lists_sec = [[0] * seconds_in_minute
                                    for _ in range(num_top)]
        estimated_post_lists_10ms = [[0] * num_bins for _ in range(num_top)]

        hourly_conversion = np.mean(confusion_sampling_rate, axis=(1, 2, 3))
        minutey_conversion = np.mean(confusion_sampling_rate, axis=(2, 3))
        secondly_conversion = np.mean(confusion_sampling_rate, axis=(3))
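        # confusion_sampling_rate is a [hour, minute, second, 10ms-bin] array of estimated
        # sampling rates; the means above give hour-, minute- and second-level rates. Each
        # observed tweet is then weighted by 1/rate (inverse-probability weighting) to
        # estimate the complete count at the chosen granularity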

        with open('../data/{0}_out/user_{0}_all.txt'.format(app_name),
                  'r') as fin:
            for line in fin:
                split_line = line.rstrip().split(',')
                user_id = split_line[1]
                if user_id in sample_top:
                    user_idx = sample_top.index(user_id)
                    tweet_id = split_line[0]
                    timestamp_ms = melt_snowflake(tweet_id)[0]
                    dt_obj = datetime.utcfromtimestamp(timestamp_ms // 1000)
                    hour = dt_obj.hour
                    minute = dt_obj.minute
                    second = dt_obj.second
                    millisec = timestamp_ms % 1000
                    ms_idx = (millisec - 7) // width if millisec >= 7 else (
                        1000 + millisec - 7) // width

                    sample_post_lists_hour[user_idx][hour] += 1
                    sample_post_lists_min[user_idx][minute] += 1
                    sample_post_lists_sec[user_idx][second] += 1
                    sample_post_lists_10ms[user_idx][ms_idx] += 1

                    estimated_post_lists_hour[user_idx][
                        hour] += 1 / hourly_conversion[hour]
                    estimated_post_lists_min[user_idx][
                        minute] += 1 / minutey_conversion[hour, minute]
                    estimated_post_lists_sec[user_idx][
                        second] += 1 / secondly_conversion[hour, minute,
                                                           second]
                    estimated_post_lists_10ms[user_idx][
                        ms_idx] += 1 / confusion_sampling_rate[hour, minute,
                                                               second, ms_idx]

        write_to_file('./sample_post_lists_hour.txt', sample_top,
                      sample_post_lists_hour)
        write_to_file('./sample_post_lists_min.txt', sample_top,
                      sample_post_lists_min)
        write_to_file('./sample_post_lists_sec.txt', sample_top,
                      sample_post_lists_sec)
        write_to_file('./sample_post_lists_10ms.txt', sample_top,
                      sample_post_lists_10ms)

        write_to_file('./estimated_post_lists_hour.txt', sample_top,
                      estimated_post_lists_hour)
        write_to_file('./estimated_post_lists_min.txt', sample_top,
                      estimated_post_lists_min)
        write_to_file('./estimated_post_lists_sec.txt', sample_top,
                      estimated_post_lists_sec)
        write_to_file('./estimated_post_lists_10ms.txt', sample_top,
                      estimated_post_lists_10ms)

        print('>>> finish dumping sample and estimated lists...')
        timer.stop()
    else:
        sample_top = []
        complete_post_lists_hour = []
        with open('./complete_post_lists_hour.txt', 'r') as fin:
            for line in fin:
                user_id, total, records = line.rstrip().split('\t')
                sample_top.append(user_id)
                records = list(map(int, records.split(',')))
                complete_post_lists_hour.append(records)

        sample_post_lists_hour = read_from_file('./sample_post_lists_hour.txt',
                                                dtype=0)
        sample_post_lists_min = read_from_file('./sample_post_lists_min.txt',
                                               dtype=0)
        sample_post_lists_sec = read_from_file('./sample_post_lists_sec.txt',
                                               dtype=0)
        sample_post_lists_10ms = read_from_file('./sample_post_lists_10ms.txt',
                                                dtype=0)

        estimated_post_lists_hour = read_from_file(
            './estimated_post_lists_hour.txt', dtype=1)
        estimated_post_lists_min = read_from_file(
            './estimated_post_lists_min.txt', dtype=1)
        estimated_post_lists_sec = read_from_file(
            './estimated_post_lists_sec.txt', dtype=1)
        estimated_post_lists_10ms = read_from_file(
            './estimated_post_lists_10ms.txt', dtype=1)

    # == == == == == == Part 4: Find the best estimate by comparing entropy across time granularities == == == == == == #
    ret = {}
    num_estimate_list = []
    num_sample_list = []
    num_complete_list = []

    sample_entity_stats = {
        user_id: sum(sample_post_lists_hour[user_idx])
        for user_idx, user_id in enumerate(sample_top)
    }
    complete_entity_stats = {
        user_id: sum(complete_post_lists_hour[user_idx])
        for user_idx, user_id in enumerate(sample_top)
    }

    min_mat = np.array([], dtype=np.int64).reshape(0, 60)
    sec_mat = np.array([], dtype=np.int64).reshape(0, 60)

    for user_idx, user_id in enumerate(sample_top):
        num_sample = sample_entity_stats[user_id]
        num_complete = complete_entity_stats[user_id]

        hour_entropy = entropy(sample_post_lists_hour[user_idx],
                               base=hours_in_day)
        min_entropy = entropy(sample_post_lists_min[user_idx],
                              base=minutes_in_hour)
        sec_entropy = entropy(sample_post_lists_sec[user_idx],
                              base=seconds_in_minute)
        ms10_entropy = entropy(sample_post_lists_10ms[user_idx], base=num_bins)

        min_mat = np.vstack(
            (min_mat, np.array(sample_post_lists_min[user_idx]).reshape(1,
                                                                        -1)))
        sec_mat = np.vstack(
            (sec_mat, np.array(sample_post_lists_sec[user_idx]).reshape(1,
                                                                        -1)))

        # pick the correction granularity: the comparison below selects the lowest
        # normalized entropy among the hour/minute/second distributions, but the final
        # choice is overridden to the 10ms-level correction when the user's millisecond
        # distribution is concentrated (entropy < 0.87, an empirical threshold), and to
        # the second-level correction otherwise
        min_entropy, min_entropy_idx = min(
            (min_entropy, min_entropy_idx)
            for (min_entropy_idx, min_entropy
                 ) in enumerate([hour_entropy, min_entropy, sec_entropy]))
        if ms10_entropy < 0.87:
            min_entropy_idx = 3
        else:
            min_entropy_idx = 2
        # # if they are all very large
        # if min_entropy >= msly_entropy_benchmark:
        #     min_entropy_idx = 2

        num_estimate = sum([
            estimated_post_lists_hour[user_idx],
            estimated_post_lists_min[user_idx],
            estimated_post_lists_sec[user_idx],
            estimated_post_lists_10ms[user_idx]
        ][min_entropy_idx])
        num_estimate_list.append(num_estimate)

        num_sample_list.append(num_sample)
        num_complete_list.append(num_complete)

        ret[user_id] = (num_sample, num_complete, num_estimate,
                        min_entropy_idx)

    np.savetxt('min_sample.npy', min_mat, delimiter=',')
    np.savetxt('sec_sample.npy', sec_mat, delimiter=',')

    rank_by_sample = [
        k for k, v in sorted(
            ret.items(), key=lambda item: item[1][0], reverse=True)
    ]
    rank_by_complete = [
        k for k, v in sorted(
            ret.items(), key=lambda item: item[1][1], reverse=True)
    ]
    rank_by_estimated = [
        k for k, v in sorted(
            ret.items(), key=lambda item: item[1][2], reverse=True)
    ]

    for user_idx, user_id in enumerate(sample_top):
        print(user_id, ret[user_id][:-1],
              (rank_by_sample.index(user_id) + 1,
               rank_by_complete.index(user_id) + 1,
               rank_by_estimated.index(user_id) + 1))
        print(
            ret[user_id][0] / ret[user_id][1],
            mape(ret[user_id][1], ret[user_id][2])[0],
            rank_by_sample.index(user_id) - rank_by_complete.index(user_id),
            rank_by_estimated.index(user_id) - rank_by_complete.index(user_id))
        print(np.sum(np.array(sample_post_lists_min[user_idx]) > 0),
              np.sum(np.array(sample_post_lists_sec[user_idx]) > 0),
              np.sum(np.array(sample_post_lists_10ms[user_idx]) > 0))

    observed_top100 = rank_by_sample[:100]
    complete_rank_for_observed_top100 = [
        rank_by_complete.index(uid) + 1 for uid in observed_top100
    ]
    user_sampling_rates_for_observed_top100 = [
        sample_entity_stats[uid] / complete_entity_stats[uid]
        for uid in observed_top100
    ]
    print('kendall tau for observed',
          kendalltau(range(1, 101), complete_rank_for_observed_top100))

    estimated_top100 = rank_by_estimated[:100]
    complete_rank_for_estimated_top100 = [
        rank_by_complete.index(uid) + 1 for uid in estimated_top100
    ]
    user_sampling_rates_for_estimated_top100 = [
        sample_entity_stats[uid] / complete_entity_stats[uid]
        for uid in estimated_top100
    ]
    print('kendall tau for estimated',
          kendalltau(range(1, 101), complete_rank_for_estimated_top100))

    axes[0].scatter(range(1, 101),
                    complete_rank_for_observed_top100,
                    s=30,
                    c=user_sampling_rates_for_observed_top100,
                    edgecolors='gray',
                    vmin=0.2,
                    vmax=0.9,
                    cmap=cm,
                    zorder=50)
    axes[0].set_xlabel('observed rank in sample set', fontsize=13)
    axes[0].set_ylabel('rank in complete set', fontsize=13)
    axes[0].text(0.04,
                 0.9,
                 r"kendall's $\tau$: {0:.4f}".format(
                     kendalltau(range(1, 101),
                                complete_rank_for_observed_top100)[0]),
                 ha='left',
                 va='top',
                 size=12,
                 transform=axes[0].transAxes)
    axes[0].plot([0, 100], [100, 100], color='gray', ls='--', lw=1)
    axes[0].plot([100, 100], [0, 100], color='gray', ls='--', lw=1)
    axes[0].plot([0, 100], [0, 100], color='gray', ls='--', lw=1)
    axes[0].set_title('(a)', fontsize=13)

    sc = axes[1].scatter(range(1, 101),
                         complete_rank_for_estimated_top100,
                         s=30,
                         c=user_sampling_rates_for_estimated_top100,
                         edgecolors='gray',
                         vmin=0.2,
                         vmax=0.9,
                         cmap=cm,
                         zorder=50)
    axes[1].set_xlabel('estimated rank in sample set', fontsize=13)
    axes[1].plot([0, 100], [100, 100], color='gray', ls='--', lw=1)
    axes[1].plot([100, 100], [0, 100], color='gray', ls='--', lw=1)
    axes[1].plot([0, 100], [0, 100], color='gray', ls='--', lw=1)
    axes[1].text(0.04,
                 0.9,
                 r"kendall's $\tau$: {0:.4f}".format(
                     kendalltau(range(1, 101),
                                complete_rank_for_estimated_top100)[0]),
                 ha='left',
                 va='top',
                 size=12,
                 transform=axes[1].transAxes)
    axes[1].set_ylim(axes[0].get_ylim())
    axes[1].set_title('(b)', fontsize=13)

    cb = plt.colorbar(sc, fraction=0.055)
    cb.set_label(label='user sampling rate', size=13)
    cb.ax.tick_params(labelsize=11)

    for ax in axes[:2]:
        ax.set_xlim([-4, 104])
        ax.set_ylim(bottom=-4)
        ax.set_xticks([0, 50, 100])
        ax.set_yticks([0, 50, 100])
        ax.tick_params(axis='both', which='major', labelsize=11)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/top_entity_rank.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
def main():
    timer = Timer()
    timer.start()

    start_timestamp = '1483660780000'  # January 5, 2017 23:59:40
    end_timestamp = '1483662210000'  # January 6, 2017 0:23:30
    start_time_obj = datetime.utcfromtimestamp(int(start_timestamp[:-3]))
    end_time_obj = datetime.utcfromtimestamp(int(end_timestamp[:-3]))
    total_duration = (end_time_obj - start_time_obj).seconds
    x_axis = [
        start_time_obj + timedelta(seconds=x) for x in range(total_duration)
    ]
    start_tid = str(make_snowflake(start_timestamp, 0, 0, 0))
    end_tid = str(make_snowflake(end_timestamp, 0, 0, 0))

    firehose_stats = defaultdict(int)
    firehose_cnt = 0
    firehose_tid = set()
    to_check = False
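    # firehose_stream.txt is assumed to hold multi-line tweet records with 'tweet_id:',
    # 'tweet_url:' and 'text:' fields; a tweet counts as YouTube-related if its URL host
    # contains youtube.com or its text contains the word 'youtube'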
    with open('./firehose_stream.txt', 'r') as fin:
        for line in fin:
            line = line.rstrip()
            if line != '':
                if line.startswith('tweet_id'):
                    tweet_id = line.split(':')[-1]
                    to_check = True
                if to_check:
                    if line.startswith('tweet_url'):
                        parsed_url = urlparse(line.split(':')[-1])
                        if 'youtube.com' in parsed_url.netloc:
                            timestamp_ms = melt_snowflake(tweet_id)[0]
                            timestamp = datetime.utcfromtimestamp(
                                timestamp_ms // 1000)
                            firehose_stats[timestamp] += 1
                            firehose_cnt += 1
                            firehose_tid.add(tweet_id)
                            to_check = False
                    if line.startswith('text:'):
                        if 'youtube' in line.lower().split(':', 1)[-1].split():
                            timestamp_ms = melt_snowflake(tweet_id)[0]
                            timestamp = datetime.utcfromtimestamp(
                                timestamp_ms // 1000)
                            firehose_stats[timestamp] += 1
                            firehose_cnt += 1
                            firehose_tid.add(tweet_id)
                            to_check = False
    print('{0}/{1} tweets were collected via Firehose'.format(
        firehose_cnt, len(firehose_tid)))

    filtered_stats = defaultdict(int)
    filtered_cnt = 0
    ratemsg_ts_list = []
    ratemsg_track_list = []
    filtered_tid = set()
    with open('./filtered_stream.txt', 'r') as fin:
        for line in fin:
            if len(line.rstrip().split(',')) > 3:
                tweet_id = line.rstrip().split(',')[0]
                if start_tid <= tweet_id <= end_tid:
                    timestamp_ms = melt_snowflake(tweet_id)[0]
                    timestamp = datetime.utcfromtimestamp(timestamp_ms // 1000)
                    filtered_stats[timestamp] += 1
                    filtered_cnt += 1
                    filtered_tid.add(tweet_id)
            else:
                _, timestamp_ms, track = line.rstrip().split(',')
                track = int(track)
                ratemsg_ts_list.append(timestamp_ms)
                ratemsg_track_list.append(track)
    print('{0}/{1} tweets were collected via filtered stream'.format(
        filtered_cnt, len(filtered_tid)))

    split_track_lst, split_ts_lst = map_ratemsg(ratemsg_track_list,
                                                ratemsg_ts_list)
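    # a rate limit message carries a cumulative count ('track') of tweets withheld since
    # the connection opened; map_ratemsg is assumed to split the messages into
    # monotonically increasing runs, so per-second missing volume is the difference
    # between consecutive track values within a run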
    missing_stats = {x: 0 for x in x_axis}
    for track_lst, ts_lst in zip(split_track_lst, split_ts_lst):
        last_track = 0  # track counts restart with every run
        for track, ts in zip(track_lst, ts_lst):
            timestamp = datetime.utcfromtimestamp(int(ts[:-3]))
            if timestamp in x_axis:
                missing_stats[timestamp] += (track - last_track)
            last_track = track

    # == == == == == == Plot Firehose, filtered, and estimated tweet volumes == == == == == == #
    cc4 = ColorPalette.CC4
    blue = cc4[0]
    red = cc4[3]
    fig, ax1 = plt.subplots(1, 1, figsize=(10, 2))

    firehose_y_axis = [
        firehose_stats[x] if x in firehose_stats else 0 for x in x_axis
    ]
    filtered_y_axis = [
        filtered_stats[x] if x in filtered_stats else 0 for x in x_axis
    ]
    missing_y_axis = [
        missing_stats[x] if x in missing_stats else 0 for x in x_axis
    ]
    estimated_y_axis = [
        filtered_y_axis[x] + missing_y_axis[x] for x in range(len(x_axis))
    ]

    ax1.plot_date(x_axis,
                  firehose_y_axis,
                  '-',
                  c='lightgrey',
                  lw=1,
                  label='firehose: {0:,}'.format(sum(firehose_y_axis)))
    ax1.plot_date(x_axis,
                  filtered_y_axis,
                  '-',
                  c=blue,
                  lw=1,
                  label='filtered: {0:,}'.format(sum(filtered_y_axis)))
    ax1.plot_date(x_axis,
                  estimated_y_axis,
                  '-',
                  c=red,
                  lw=1,
                  label='estimated: {0:,}'.format(sum(estimated_y_axis)))

    print('MAPE against Firehose volume: {0:.3f}%'.format(
        abs(sum(estimated_y_axis) - sum(firehose_y_axis)) /
        sum(firehose_y_axis) * 100))

    ax1.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))
    ax1.set_xlabel('Jan 06, 2017', fontsize=12)
    ax1.set_ylabel('#tweets', fontsize=12)
    ax1.tick_params(axis='both', which='major', labelsize=12)
    ax1.legend(frameon=False, loc='upper left', fontsize=12, ncol=3)

    # remove borders
    ax1.spines['right'].set_visible(False)
    ax1.spines['top'].set_visible(False)

    plt.tight_layout()
    plt.savefig('../images/SI_ratemsg_firehose.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
def main():
    timer = Timer()
    timer.start()

    app_name = 'cyberbullying'
    if app_name == 'cyberbullying':
        target_suffix = ['1', '2', '3', '4', '5', '6', '7', '8', 'all']
    elif app_name == 'youtube':
        target_suffix = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', 'all']
    else:
        target_suffix = ['1', '2', '3', '4', '5', '6', '7', '8', 'all']

    os.makedirs('../data/{0}_out'.format(app_name), exist_ok=True)

    # load disconnect msg
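    # each crawl log line is assumed to start with 'YYYY-mm-dd HH:MM:SS,mmm' and to carry
    # the crawler name '<app_name>_<suffix>' as its fifth token; we recover a millisecond
    # timestamp and the crawler suffix for every disconnect entry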
    disconnect_dict = {k: [] for k in target_suffix}
    if os.path.exists('../log/{0}_crawl.log'.format(app_name)):
        with open('../log/{0}_crawl.log'.format(app_name), 'r') as fin:
            for line in fin:
                split_line = line.rstrip().split()
                timestamp_ms = int(datetime.strptime('{0} {1}'.format(split_line[0], split_line[1][:-4]), '%Y-%m-%d %H:%M:%S').replace(tzinfo=timezone.utc).timestamp()) * 1000 + int(split_line[1][-3:])
                disconnect_dict[split_line[4].split('_')[1]].append(timestamp_ms)

    # ratemsg_offset = []
    # for suffix_idx, suffix in enumerate(target_suffix):
    #     suffix_dir = '{0}_{1}'.format(app_name, suffix)
    #     input_path = '../data/{0}_out/{1}.txt.bz2'.format(app_name, suffix_dir)
    #
    #     with bz2.BZ2File(input_path, mode='r') as fin:
    #         ratemsg_ts_streaming_dict = {}
    #
    #         last_timestamp_ms = 0
    #         last_ratemsg_timestamp_ms = 0
    #         has_ratemsg = False
    #
    #         for line in fin:
    #             split_line = line.decode('utf8').rstrip().split(',')
    #             if len(split_line) == 3:
    #                 last_ratemsg_timestamp_ms = int(split_line[1])
    #                 ratemsg_ts_streaming_dict[last_ratemsg_timestamp_ms] = 'ratemsg{0}-{1}'.format(suffix, split_line[2])
    #                 has_ratemsg = True
    #             else:
    #                 current_timestamp_ms = int(split_line[2])
    #
    #                 if has_ratemsg:
    #                     corrected_ratemsg_ts = (last_timestamp_ms + current_timestamp_ms) // 2
    #                     ratemsg_offset.append(last_ratemsg_timestamp_ms - corrected_ratemsg_ts)
    #                     has_ratemsg = False
    #
    #                 last_timestamp_ms = current_timestamp_ms
    #         print('>>> Loaded rate limit message timestamp offset for {0}...'.format(input_path))
    #
    # if len(ratemsg_offset) > 0:
    #     print('>>> rate limit message timestamp_ms offset')
    #     print('|  min  |  10th  |  25th  | median |  75th  |  90th  |  max  |  mean  |  std  |')
    #     print('|{0: ^7.0f}|{1: ^8.0f}|{2: ^8.0f}|{3: ^8.0f}|{4: ^8.0f}|{5: ^8.0f}|{6: ^7.0f}|{7: ^8.0f}|{8: ^7.0f}|'
    #           .format(np.min(ratemsg_offset), np.percentile(ratemsg_offset, 10), np.percentile(ratemsg_offset, 25),
    #                   np.median(ratemsg_offset), np.percentile(ratemsg_offset, 75), np.percentile(ratemsg_offset, 90),
    #                   np.max(ratemsg_offset), np.mean(ratemsg_offset), np.std(ratemsg_offset)))
    #     best_offset = int(np.mean(ratemsg_offset))
    # else:
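    # fallback offset (in ms) used when the measurement above is skipped: rate limit
    # message timestamps trail the stream position where the withheld tweets actually
    # belong, so they are shifted back by best_offset before being turned into snowflake ids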
    best_offset = 5000
    print('best rate limit message timestamp_ms offset is {0}'.format(best_offset))
    # timer.stop()

    for suffix_idx, suffix in enumerate(target_suffix):
        timer = Timer()
        timer.start()

        suffix_dir = '{0}_{1}'.format(app_name, suffix)
        input_path = '../data/{0}_out/{1}.txt.bz2'.format(app_name, suffix_dir)
        ts_output_path = '../data/{0}_out/ts_{1}.txt'.format(app_name, suffix_dir)
        user_output_path = '../data/{0}_out/user_{1}.txt'.format(app_name, suffix_dir)
        vid_output_path = '../data/{0}_out/vid_{1}.txt'.format(app_name, suffix_dir)
        mention_output_path = '../data/{0}_out/mention_{1}.txt'.format(app_name, suffix_dir)
        hashtag_output_path = '../data/{0}_out/hashtag_{1}.txt'.format(app_name, suffix_dir)
        retweet_output_path = '../data/{0}_out/retweet_{1}.txt'.format(app_name, suffix_dir)
        follower_output_path = '../data/{0}_out/follower_{1}.txt'.format(app_name, suffix_dir)

        disconnect_list = disconnect_dict[suffix]

        with bz2.BZ2File(input_path, mode='r') as fin:
            min_tweet_id = None
            visited_tid = set()
            ts_streaming_dict = {}
            user_streaming_dict = {}
            vid_streaming_dict = {}
            mention_streaming_dict = {}
            hashtag_streaming_dict = {}
            tid_retweet_dict = defaultdict(set)
            root_tweet_follower_dict = {}

            for disconnect_ts in disconnect_list:
                # make a snowflake id for disconnect message
                ts_streaming_dict[str(make_snowflake(disconnect_ts, 31, 31, suffix_idx + 1))] = 'disconnect'

            for line in fin:
                split_line = line.decode('utf8').rstrip().split(',')
                if len(split_line) == 3:
                    # make a snowflake id for rate limit message
                    ts_streaming_dict[str(make_snowflake(int(split_line[1]) - best_offset, 31, 31, suffix_idx))] = 'ratemsg{0}-{1}'.format(suffix, split_line[2])
                else:
                    tweet_id = split_line[0]
                    if min_tweet_id is None:
                        min_tweet_id = tweet_id
                    else:
                        if tweet_id < min_tweet_id:
                            min_tweet_id = tweet_id
                    if tweet_id in visited_tid:
                        continue

                    ts_streaming_dict[tweet_id] = split_line[2]

                    # root_user_id_str, reply_user_id_str, retweeted_user_id_str, quoted_user_id_str
                    user_streaming_dict[tweet_id] = '{0},{1},{2},{3}'.format(split_line[3], split_line[35], split_line[36], split_line[37])

                    to_write_vid = set()
                    for vids in [split_line[5], split_line[6], split_line[7]]:
                        if vids != 'N':
                            if ';' in vids:
                                to_write_vid.update(set(vids.split(';')))
                            else:
                                to_write_vid.add(vids)
                    if len(to_write_vid) > 0:
                        vid_streaming_dict[tweet_id] = ','.join(to_write_vid)

                    to_write_mention = set()
                    for mentions in [split_line[8], split_line[9], split_line[10]]:
                        if mentions != 'N':
                            if ';' in mentions:
                                to_write_mention.update(set(mentions.split(';')))
                            else:
                                to_write_mention.add(mentions)
                    if len(to_write_mention) > 0:
                        mention_streaming_dict[tweet_id] = ','.join(to_write_mention)

                    to_write_hashtag = set()
                    for hashtags in [split_line[11], split_line[12], split_line[13]]:
                        if hashtags != 'N':
                            if ';' in hashtags:
                                to_write_hashtag.update(set(hashtags.split(';')))
                            else:
                                to_write_hashtag.add(hashtags)
                    if len(to_write_hashtag) > 0:
                        hashtag_streaming_dict[tweet_id] = ','.join(to_write_hashtag)

                    if split_line[33] != 'N' and split_line[33] >= min_tweet_id:
                        tid_retweet_dict[split_line[33]].add('{0}-{1}'.format(tweet_id, split_line[20]))

                    if split_line[32] == 'N' and split_line[33] == 'N' and split_line[34] == 'N':
                        root_tweet_follower_dict[tweet_id] = split_line[20]

                    visited_tid.add(split_line[0])

            print('>>> Loaded all data, ready to sort and dump {0}...'.format(input_path))

            with open(ts_output_path, 'w') as fout1:
                for tid in sorted(ts_streaming_dict.keys()):
                    if ts_streaming_dict[tid].startswith('ratemsg'):
                        ts = melt_snowflake(tid)[0]
                        ratesuffix, track = ts_streaming_dict[tid].split('-')
                        fout1.write('{0},{1},{2}\n'.format(ts, ratesuffix, track))
                    elif ts_streaming_dict[tid].startswith('disconnect'):
                        ts = melt_snowflake(tid)[0]
                        fout1.write('{0},{1},{2}\n'.format(ts, 'disconnect', suffix))
                    else:
                        fout1.write('{0},{1}\n'.format(ts_streaming_dict[tid], tid))

            with open(user_output_path, 'w') as fout2:
                for tid in sorted(user_streaming_dict.keys()):
                    fout2.write('{0},{1}\n'.format(tid, user_streaming_dict[tid]))

            with open(vid_output_path, 'w') as fout3:
                for tid in sorted(vid_streaming_dict.keys()):
                    fout3.write('{0},{1}\n'.format(tid, vid_streaming_dict[tid]))

            with open(mention_output_path, 'w') as fout4:
                for tid in sorted(mention_streaming_dict.keys()):
                    fout4.write('{0},{1}\n'.format(tid, mention_streaming_dict[tid]))

            with open(hashtag_output_path, 'w', encoding='utf-8') as fout5:
                for tid in sorted(hashtag_streaming_dict.keys()):
                    fout5.write('{0},{1}\n'.format(tid, hashtag_streaming_dict[tid]))

            with open(retweet_output_path, 'w') as fout6:
                for root_tweet_id in sorted(tid_retweet_dict.keys()):
                    fout6.write('{0}:{1}\n'.format(root_tweet_id, ','.join(sorted(list(tid_retweet_dict[root_tweet_id])))))

            with open(follower_output_path, 'w') as fout7:
                for root_tweet_id in sorted(root_tweet_follower_dict.keys()):
                    fout7.write('{0},{1}\n'.format(root_tweet_id, root_tweet_follower_dict[root_tweet_id]))

        timer.stop()
def main():
    timer = Timer()
    timer.start()

    app_name = 'cyberbullying'

    hours_in_day = 24
    minutes_in_hour = 60
    seconds_in_minute = 60
    ms_in_second = 1000

    num_bins = 100
    width = ms_in_second // num_bins

    num_top = 500

    confusion_sampling_rate = np.load('../data/{0}_out/{0}_confusion_sampling_rate.npy'.format(app_name))
    confusion_sampling_rate = np.nan_to_num(confusion_sampling_rate)

    load_external_data = False
    if not load_external_data:
        sample_entity_stats = defaultdict(int)
        with open('../data/{0}_out/user_{0}_all.txt'.format(app_name), 'r') as fin:
            for line in fin:
                split_line = line.rstrip().split(',')
                sample_entity_stats[split_line[1]] += 1

        # == == == == == == Part 1: Find top users in the sample set == == == == == == #
        print('>>> found top {0} users in sample set...'.format(num_top))
        sample_top = [kv[0] for kv in sorted(sample_entity_stats.items(), key=lambda x: x[1], reverse=True)[:num_top]]

        # == == == == == == Part 2: Count the top users' tweets in the complete set == == == == == == #
        complete_post_lists_hour = [[0] * hours_in_day for _ in range(num_top)]
        complete_post_lists_min = [[0] * minutes_in_hour for _ in range(num_top)]
        complete_post_lists_sec = [[0] * seconds_in_minute for _ in range(num_top)]
        complete_post_lists_10ms = [[0] * num_bins for _ in range(num_top)]

        complete_entity_stats = defaultdict(int)
        with open('../data/{0}_out/complete_user_{0}.txt'.format(app_name), 'r') as fin:
            for line in fin:
                split_line = line.rstrip().split(',')
                user_id = split_line[1]
                if user_id in sample_top:
                    complete_entity_stats[user_id] += 1

                    user_idx = sample_top.index(user_id)
                    tweet_id = split_line[0]
                    timestamp_ms = melt_snowflake(tweet_id)[0]
                    dt_obj = datetime.utcfromtimestamp(timestamp_ms // 1000)
                    hour = dt_obj.hour
                    minute = dt_obj.minute
                    second = dt_obj.second
                    millisec = timestamp_ms % 1000
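                    # bin the millisecond component into 10 ms bins, shifted by 7 ms with wraparound so values below 7 ms land in the last bin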
                    ms_idx = (millisec-7) // width if millisec >= 7 else (1000 + millisec-7) // width

                    complete_post_lists_hour[user_idx][hour] += 1
                    complete_post_lists_min[user_idx][minute] += 1
                    complete_post_lists_sec[user_idx][second] += 1
                    complete_post_lists_10ms[user_idx][ms_idx] += 1

        write_to_file('./complete_post_lists_hour.txt', sample_top, complete_post_lists_hour)
        write_to_file('./complete_post_lists_min.txt', sample_top, complete_post_lists_min)
        write_to_file('./complete_post_lists_sec.txt', sample_top, complete_post_lists_sec)
        write_to_file('./complete_post_lists_10ms.txt', sample_top, complete_post_lists_10ms)

        print('>>> finish dumping complete lists...')
        timer.stop()

        # == == == == == == Part 3: Find tweets appearing in sample set == == == == == == #
        sample_post_lists_hour = [[0] * hours_in_day for _ in range(num_top)]
        sample_post_lists_min = [[0] * minutes_in_hour for _ in range(num_top)]
        sample_post_lists_sec = [[0] * seconds_in_minute for _ in range(num_top)]
        sample_post_lists_10ms = [[0] * num_bins for _ in range(num_top)]

        estimated_post_lists_hour = [[0] * hours_in_day for _ in range(num_top)]
        estimated_post_lists_min = [[0] * minutes_in_hour for _ in range(num_top)]
        estimated_post_lists_sec = [[0] * seconds_in_minute for _ in range(num_top)]
        estimated_post_lists_10ms = [[0] * num_bins for _ in range(num_top)]

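        # marginal sampling rates: average the (hour, minute, second, 10 ms bin) rate tensor over its finer-grained axes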
        hourly_conversion = np.mean(confusion_sampling_rate, axis=(1, 2, 3))
        minutey_conversion = np.mean(confusion_sampling_rate, axis=(2, 3))
        secondly_conversion = np.mean(confusion_sampling_rate, axis=(3))

        with open('../data/{0}_out/user_{0}_all.txt'.format(app_name), 'r') as fin:
            for line in fin:
                split_line = line.rstrip().split(',')
                user_id = split_line[1]
                if user_id in sample_top:
                    user_idx = sample_top.index(user_id)
                    tweet_id = split_line[0]
                    timestamp_ms = melt_snowflake(tweet_id)[0]
                    dt_obj = datetime.utcfromtimestamp(timestamp_ms // 1000)
                    hour = dt_obj.hour
                    minute = dt_obj.minute
                    second = dt_obj.second
                    millisec = timestamp_ms % 1000
                    ms_idx = (millisec-7) // width if millisec >= 7 else (1000 + millisec-7) // width

                    sample_post_lists_hour[user_idx][hour] += 1
                    sample_post_lists_min[user_idx][minute] += 1
                    sample_post_lists_sec[user_idx][second] += 1
                    sample_post_lists_10ms[user_idx][ms_idx] += 1

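                    # inverse-probability estimates: each sampled tweet counts as 1/sampling_rate tweets at the corresponding granularity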
                    estimated_post_lists_hour[user_idx][hour] += 1 / hourly_conversion[hour]
                    estimated_post_lists_min[user_idx][minute] += 1 / minutey_conversion[hour, minute]
                    estimated_post_lists_sec[user_idx][second] += 1 / secondly_conversion[hour, minute, second]
                    estimated_post_lists_10ms[user_idx][ms_idx] += 1 / confusion_sampling_rate[hour, minute, second, ms_idx]

        write_to_file('./sample_post_lists_hour.txt', sample_top, sample_post_lists_hour)
        write_to_file('./sample_post_lists_min.txt', sample_top, sample_post_lists_min)
        write_to_file('./sample_post_lists_sec.txt', sample_top, sample_post_lists_sec)
        write_to_file('./sample_post_lists_10ms.txt', sample_top, sample_post_lists_10ms)

        write_to_file('./estimated_post_lists_hour.txt', sample_top, estimated_post_lists_hour)
        write_to_file('./estimated_post_lists_min.txt', sample_top, estimated_post_lists_min)
        write_to_file('./estimated_post_lists_sec.txt', sample_top, estimated_post_lists_sec)
        write_to_file('./estimated_post_lists_10ms.txt', sample_top, estimated_post_lists_10ms)

        print('>>> finish dumping sample and estimated lists...')
        timer.stop()
    else:
        sample_top = []
        complete_post_lists_hour = []
        with open('./complete_post_lists_hour.txt', 'r') as fin:
            for line in fin:
                user_id, total, records = line.rstrip().split('\t')
                sample_top.append(user_id)
                records = list(map(int, records.split(',')))
                complete_post_lists_hour.append(records)

        complete_post_lists_min = read_from_file('./complete_post_lists_min.txt', dtype=0)
        complete_post_lists_sec = read_from_file('./complete_post_lists_sec.txt', dtype=0)
        complete_post_lists_10ms = read_from_file('./complete_post_lists_10ms.txt', dtype=0)

        sample_post_lists_hour = read_from_file('./sample_post_lists_hour.txt', dtype=0)
        sample_post_lists_min = read_from_file('./sample_post_lists_min.txt', dtype=0)
        sample_post_lists_sec = read_from_file('./sample_post_lists_sec.txt', dtype=0)
        sample_post_lists_10ms = read_from_file('./sample_post_lists_10ms.txt', dtype=0)

        estimated_post_lists_hour = read_from_file('./estimated_post_lists_hour.txt', dtype=1)
        estimated_post_lists_min = read_from_file('./estimated_post_lists_min.txt', dtype=1)
        estimated_post_lists_sec = read_from_file('./estimated_post_lists_sec.txt', dtype=1)
        estimated_post_lists_10ms = read_from_file('./estimated_post_lists_10ms.txt', dtype=1)

    # == == == == == == Part 4: Find the best estimation by comparing timestamp entropy == == == == == == #
    ret = {}
    num_estimate_list = []
    num_sample_list = []
    num_complete_list = []

    sample_entity_stats = {user_id: sum(sample_post_lists_hour[user_idx]) for user_idx, user_id in enumerate(sample_top)}
    complete_entity_stats = {user_id: sum(complete_post_lists_hour[user_idx]) for user_idx, user_id in enumerate(sample_top)}

    min_mat = np.array([], dtype=np.int64).reshape(0, 60)
    sec_mat = np.array([], dtype=np.int64).reshape(0, 60)

    for user_idx, user_id in enumerate(sample_top):
        num_sample = sample_entity_stats[user_id]
        num_complete = complete_entity_stats[user_id]

        hour_entropy = entropy(sample_post_lists_hour[user_idx], base=hours_in_day)
        min_entropy = entropy(sample_post_lists_min[user_idx], base=minutes_in_hour)
        sec_entropy = entropy(sample_post_lists_sec[user_idx], base=seconds_in_minute)
        ms10_entropy = entropy(sample_post_lists_10ms[user_idx], base=num_bins)

        min_mat = np.vstack((min_mat, np.array(sample_post_lists_min[user_idx]).reshape(1, -1)))
        sec_mat = np.vstack((sec_mat, np.array(sample_post_lists_sec[user_idx]).reshape(1, -1)))

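        # a highly concentrated (low-entropy) 10 ms posting pattern is estimated at the millisecond granularity; otherwise fall back to the hourly estimate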
        if ms10_entropy < 0.87:
            min_entropy_idx = 3
        else:
            min_entropy_idx = 0

        num_estimate = sum([estimated_post_lists_hour[user_idx], estimated_post_lists_min[user_idx],
                            estimated_post_lists_sec[user_idx], estimated_post_lists_10ms[user_idx]][min_entropy_idx])
        num_estimate_list.append(num_estimate)

        num_sample_list.append(num_sample)
        num_complete_list.append(num_complete)

        ret[user_id] = (num_sample, num_complete, num_estimate, min_entropy_idx)

    # == == == == == == Part 5: Plot case users == == == == == == #
    case_user_ids = ['1033778124968865793', '1182605743335211009']
    case_user_screennames = ['WeltRadio', 'bensonbersk']

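    # for each case user, plot complete (gray bars), sample (colored bars), and estimated (black line) volumes at the selected granularity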
    fig, axes = plt.subplots(1, 2, figsize=(7.2, 2.3))

    cc4 = ColorPalette.CC4
    blue = cc4[0]
    red = cc4[3]
    filled_colors = [blue, red]
    labels = ['(c)', '(d)']
    for ax_idx, user_id in enumerate(case_user_ids):
        user_idx = sample_top.index(user_id)
        min_entropy_idx = ret[user_id][-1]

        if min_entropy_idx == 0:
            axes[ax_idx].bar(range(hours_in_day), complete_post_lists_hour[user_idx], color='lightgray', width=1)
            axes[ax_idx].bar(range(hours_in_day), sample_post_lists_hour[user_idx], color=filled_colors[ax_idx], alpha=0.8, width=1)
            axes[ax_idx].plot(range(hours_in_day), estimated_post_lists_hour[user_idx], 'k-', lw=1.5)
            axes[ax_idx].set_xlabel('hour', fontsize=12)
            axes[ax_idx].set_xlim([-1, hours_in_day+1])
            axes[ax_idx].set_xticks([0, 6, 12, 18, 24])
        elif min_entropy_idx == 1:
            axes[ax_idx].bar(range(minutes_in_hour), complete_post_lists_min[user_idx], color='lightgray', width=1)
            axes[ax_idx].bar(range(minutes_in_hour), sample_post_lists_min[user_idx], color=filled_colors[ax_idx], alpha=0.8, width=1)
            axes[ax_idx].plot(range(minutes_in_hour), estimated_post_lists_min[user_idx], 'k-', lw=1.5)
            axes[ax_idx].set_xlabel('minute', fontsize=12)
            axes[ax_idx].set_xlim([-1, minutes_in_hour+1])
            axes[ax_idx].set_xticks([0, 15, 30, 45, 60])
        elif min_entropy_idx == 2:
            axes[ax_idx].bar(range(seconds_in_minute), complete_post_lists_sec[user_idx], color='lightgray', width=1)
            axes[ax_idx].bar(range(seconds_in_minute), sample_post_lists_sec[user_idx], color=filled_colors[ax_idx], alpha=0.8, width=1)
            axes[ax_idx].plot(range(seconds_in_minute), estimated_post_lists_sec[user_idx], 'k-', lw=1.5)
            axes[ax_idx].set_xlabel('second', fontsize=12)
            axes[ax_idx].set_xlim([-1, seconds_in_minute+1])
            axes[ax_idx].set_xticks([0, 15, 30, 45, 60])
        elif min_entropy_idx == 3:
            axes[ax_idx].bar(range(num_bins), complete_post_lists_10ms[user_idx], color='lightgray', width=1)
            axes[ax_idx].bar(range(num_bins), sample_post_lists_10ms[user_idx], color=filled_colors[ax_idx], alpha=0.8, width=1)
            axes[ax_idx].plot(range(num_bins), estimated_post_lists_10ms[user_idx], 'k-', lw=1.5)
            axes[ax_idx].set_xlabel('millisecond', fontsize=12)
            axes[ax_idx].set_xlim([-3, num_bins+3])
            axes[ax_idx].set_xticks([0, 25, 50, 75, 100])
            axes[ax_idx].xaxis.set_major_formatter(FuncFormatter(lambda x, _: 10*x))

        axes[ax_idx].tick_params(axis='both', which='major', labelsize=11)
        axes[ax_idx].set_title('{0} {1}'.format(labels[ax_idx], case_user_screennames[ax_idx]), fontsize=13)

    axes[0].set_ylabel('volume', fontsize=12)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/suspicious_users.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()


def main():
    app_name = 'covid'
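    # suffixes of the per-crawler output directories to merge for this app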
    if app_name == 'cyberbullying':
        target_suffix = ['1', '2', '3', '4', '5', '6', '7', '8', 'all']
    elif app_name == 'youtube' or app_name == 'covid':
        target_suffix = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', 'all']
    else:
        target_suffix = ['1', '2', '3', '4', '5', '6', '7', '8', 'all']

    archive_dir = '../data/{0}_out'.format(app_name)
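    # offset (in ms) subtracted from rate limit message timestamps before converting them to synthetic snowflake ids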
    best_offset = 5000

    # merge timestamps
    timer = Timer()
    timer.start()

    for suffix_idx, suffix in enumerate(target_suffix):
        print('>>> Merging suffix {0}_{1}...'.format(app_name, suffix))
        visited_tid = set()
        ts_streaming_dict = {}
        for subdir, _, files in os.walk(os.path.join(archive_dir, '{0}_{1}'.format(app_name, suffix), 'timestamp')):
            for f in sorted(files):
                with open(os.path.join(subdir, f), 'r') as fin:
                    for line in fin:
                        split_line = line.rstrip().split(',')
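                        # 3-field lines are rate limit messages: they carry no tweet id, so a synthetic snowflake id is built from the offset-adjusted timestamp to key them alongside real tweets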
                        if len(split_line) == 3:
                            ts_streaming_dict[str(make_snowflake(int(split_line[0]) - best_offset, 31, 31, suffix_idx))] = 'ratemsg{0},{1}'.format(suffix, split_line[2])
                        else:
                            tweet_id = split_line[1]
                            if tweet_id not in visited_tid:
                                ts_streaming_dict[tweet_id] = split_line[0]
                                visited_tid.add(tweet_id)

        with bz2.open(os.path.join(archive_dir, '{0}_{1}/ts_{0}_{1}.bz2'.format(app_name, suffix)), 'wt') as ts_output:
            for tid in sorted(ts_streaming_dict.keys()):
                if ts_streaming_dict[tid].startswith('ratemsg'):
                    ts = melt_snowflake(tid)[0]
                    ts_output.write('{0},{1}\n'.format(ts, ts_streaming_dict[tid]))
                else:
                    ts_output.write('{0},{1}\n'.format(ts_streaming_dict[tid], tid))
        print('>>> Finishing merging suffix {0}_{1}'.format(app_name, suffix))

    print('>>> Merging complete stream for {0}...'.format(app_name))
    inputfile_list = ['{0}_{1}/ts_{0}_{1}.bz2'.format(app_name, suffix) for suffix in target_suffix]
    inputfile_handles = [bz2.BZ2File(os.path.join(archive_dir, inputfile), mode='r') for inputfile in inputfile_list]
    visited_item_set = set()

    with bz2.open(os.path.join(archive_dir, 'complete_ts_{0}.bz2'.format(app_name)), 'wt') as ts_output:
        nextline_list = [inputfile.readline().decode('utf-8') for inputfile in inputfile_handles]

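        # merge the sorted per-crawler streams: repeatedly take the next item, write it once, and advance that stream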
        while True:
            next_idx, next_item, end_flag = find_next_item(nextline_list)
            if end_flag:
                break
            # omit rate limit messages and already-seen items when building the complete stream
            if 'ratemsg' not in next_item and next_item not in visited_item_set:
                ts_output.write(next_item)
                visited_item_set.add(next_item)
            nextline_list[next_idx] = inputfile_handles[next_idx].readline().decode('utf-8')

    for inputfile in inputfile_handles:
        inputfile.close()
    print('>>> Finishing merging complete stream for {0}'.format(app_name))

    timer.stop()