Example #1
def main():
    timer = Timer()
    timer.start()

    app_name = 'cyberbullying'

    complete_user_id_set = set()
    with open('../data/{0}_out/complete_user_{0}.txt'.format(app_name),
              'r') as fin:
        for line in fin:
            tid, root_uid, _ = line.rstrip().split(',', 2)
            complete_user_id_set.add(root_uid)

    embed_uid_dict = {
        'u{0}'.format(embed): uid
        for embed, uid in enumerate(sorted(list(complete_user_id_set)))
    }
    num_user_complete = len(embed_uid_dict)
    print('{0} users appear in the complete set'.format(num_user_complete))

    with open('../networks/{0}_embed_user.txt'.format(app_name), 'w') as fout:
        for uid in sorted(embed_uid_dict.keys()):
            fout.write('{0},{1}\n'.format(uid, embed_uid_dict[uid]))

    print('>>> Finish embedding users')
    timer.stop()

    complete_hashtag_id_set = set()
    with open('../data/{0}_out/complete_hashtag_{0}.txt'.format(app_name),
              'r',
              encoding='utf-8') as fin:
        for line in fin:
            tid, *hashtags = line.rstrip().lower().split(',')
            complete_hashtag_id_set.update(hashtags)

    embed_hid_dict = {
        'h{0}'.format(embed): hashtag
        for embed, hashtag in enumerate(sorted(list(complete_hashtag_id_set)))
    }
    num_hashtag_complete = len(embed_hid_dict)
    print(
        '{0} hashtags appear in the complete set'.format(num_hashtag_complete))

    with open('../networks/{0}_embed_hashtag.txt'.format(app_name),
              'w',
              encoding='utf-8') as fout:
        for hid in sorted(embed_hid_dict.keys()):
            fout.write('{0},{1}\n'.format(hid, embed_hid_dict[hid]))

    print('>>> Finish embedding hashtags')
    timer.stop()
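Every example below calls a Timer utility whose implementation is not shown; here is a minimal stand-in sketch, assuming it only needs start() and stop() with elapsed-time printing (a hypothetical helper, not the project's actual code):

import time


class Timer:
    """Minimal stand-in for the Timer helper assumed by the examples."""

    def start(self):
        # record the wall-clock start time
        self._start_time = time.time()

    def stop(self):
        # report elapsed seconds since start() was called
        print('elapsed time: {0:.2f} seconds'.format(time.time() - self._start_time))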
Example #2
def main():
    timer = Timer()
    timer.start()

    cc4 = ColorPalette.CC4

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))

    start_idx = 21000
    end_idx = 25500
    timestamp_list = []
    track_list = []

    with open('rate_limit_2015-09-08.txt', 'r') as fin:
        for line in fin:
            rate_json = json.loads(line.rstrip())
            track = rate_json['limit']['track']
            track_list.append(track)
            timestamp = datetime.utcfromtimestamp(
                (int(rate_json['limit']['timestamp_ms'][:-3])))
            timestamp_list.append(timestamp)

    axes[0].scatter(timestamp_list[start_idx:end_idx],
                    track_list[start_idx:end_idx],
                    c='k',
                    s=0.4)
    axes[0].set_xlim([timestamp_list[start_idx], timestamp_list[end_idx]])
    axes[0].xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))
    axes[0].set_xticks(axes[0].get_xticks()[::2])
    axes[0].set_xlabel('Sep 08, 2015', fontsize=16)
    axes[0].set_ylabel('value', fontsize=16)
    axes[0].yaxis.set_major_formatter(FuncFormatter(concise_fmt))
    axes[0].tick_params(axis='both', which='major', labelsize=16)
    axes[0].set_title('(a)', size=18, pad=-3 * 72, y=1.0001)

    print('start timestamp', timestamp_list[start_idx])
    print('end timestamp', timestamp_list[end_idx])
    split_track_lst, split_ts_lst = map_ratemsg(
        track_list[start_idx:end_idx], timestamp_list[start_idx:end_idx])
    total_miss = 0
    for track_lst, ts_lst, color in zip(split_track_lst, split_ts_lst, cc4):
        axes[1].scatter(ts_lst, track_lst, c=color, s=0.4)
        # within each connection the track counter is cumulative, so the
        # per-connection miss count is the last value minus the first
        total_miss += (track_lst[-1] - track_lst[0])
    print('{0} tweets are missing'.format(total_miss))
    axes[1].set_xlim([timestamp_list[start_idx], timestamp_list[end_idx]])
    axes[1].xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))
    axes[1].set_xticks(axes[1].get_xticks()[::2])
    axes[1].set_xlabel('Sep 08, 2015', fontsize=16)
    axes[1].yaxis.set_major_formatter(FuncFormatter(concise_fmt))
    axes[1].tick_params(axis='both', which='major', labelsize=16)
    axes[1].set_title('(b)', size=18, pad=-3 * 72, y=1.0001)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/SI_ratemsg_coloring.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
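This example and several later ones estimate missing tweets from Twitter rate limit messages: each message carries a cumulative track count of tweets withheld since the current connection started. The helpers map_ratemsg (which apparently splits the track/timestamp series into per-connection runs for the colouring in panel b) and count_track are not shown; below is a rough reconstruction of the counting logic under that assumption (hypothetical; the project's count_track also accepts start_with_rate/subcrawler flags that are ignored here):

def count_track(track_list):
    """Sum per-connection (last - first) track differences; a drop in the counter marks a reconnect."""
    num_miss = 0
    run_start, prev = None, None
    for track in track_list:
        if prev is None or track < prev:
            # counter reset: close the previous run and start a new one
            if prev is not None:
                num_miss += prev - run_start
            run_start = track
        prev = track
    if prev is not None:
        num_miss += prev - run_start
    return num_miss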
Example #3
def main():
    timer = Timer()
    timer.start()

    data_prefix = '../data/'

    fig, axes = plt.subplots(ncols=3, nrows=3, figsize=(12, 4), sharex='col')
    gs = axes[0, 0].get_gridspec()
    for ax in axes[:, 0]:
        ax.remove()
    ax_left = fig.add_subplot(gs[:, 0])
    ax_left.set_axis_off()
    ax_left.spines['top'].set_visible(False)
    ax_left.spines['right'].set_visible(False)
    ax_left.spines['bottom'].set_visible(False)
    ax_left.spines['left'].set_visible(False)

    ax_left.set_title('(a)', fontsize=12)
    axes = axes[:, 1:].ravel()

    video_title_list = ['Hello', 'Someone like you', 'Rolling in the deep',
                        'Skyfall', 'Set fire to the rain', 'Hometown glory']

    # == == == == == == Part 1: Load data == == == == == == #
    fig_idx = 0
    with open(os.path.join(data_prefix, 'teaser.json'), 'r') as fin:
        for line in fin:
            video_json = json.loads(line.rstrip())
            daily_view = video_json['insights']['dailyView']
            end_date = datetime.strptime(video_json['insights']['endDate'], '%Y-%m-%d')
            start_date = end_date - timedelta(days=len(daily_view))
            date_axis = [start_date + timedelta(days=t) for t in range(len(daily_view))]

            # plot daily view series
            axes[fig_idx].plot_date(date_axis, daily_view, 'k-')

            axes[fig_idx].axvline(x=datetime(2015, 10, 23), color=ColorPalette.TOMATO, linestyle='--', lw=1.5, zorder=30)
            axes[fig_idx].text(0.3, 0.95, video_title_list[fig_idx], size=10,
                               transform=axes[fig_idx].transAxes, ha='center', va='bottom')
            axes[fig_idx].tick_params(axis='both', which='major', labelsize=10)
            axes[fig_idx].yaxis.set_major_formatter(FuncFormatter(concise_fmt))
            axes[fig_idx].xaxis.set_major_formatter(mdates.DateFormatter("'%y"))

            fig_idx += 1

    axes[2].set_ylabel('daily views', fontsize=11)
    axes[0].set_title('(b)', fontsize=12)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/intro_teaser.pdf', bbox_inches='tight')
    plt.show()
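Several of the plotting examples also lean on two small matplotlib helpers that are not shown, concise_fmt (a FuncFormatter callback) and hide_spines. Plausible minimal versions, assuming they abbreviate large tick values and strip the top/right borders:

import numpy as np


def concise_fmt(x, pos=None):
    # abbreviate large tick values, e.g. 2500000 -> '2.5M', 30000 -> '30K'
    if x >= 1e6:
        return '{0:g}M'.format(x / 1e6)
    if x >= 1e3:
        return '{0:g}K'.format(x / 1e3)
    return '{0:g}'.format(x)


def hide_spines(axes):
    # hide the top and right spine of every subplot
    for ax in np.ravel(axes):
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)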
Example #4
def main():
    timer = Timer()
    timer.start()

    consumer_key = conf.twitter_consumer_key
    consumer_secret = conf.twitter_consumer_secret
    access_token = conf.twitter_access_token
    access_token_secret = conf.twitter_access_secret

    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = API(auth)

    num_media = 0
    num_found_twitter = 0
    num_found_youtube = 0
    num_fail = 0

    app_name = 'mbfc'

    with open('data/{0}/{0}_ratings_v3.csv'.format(app_name), 'w') as fout:
        with open('data/{0}/{0}_ratings_v2.csv'.format(app_name), 'r') as fin:
            fout.write('{0},{1},{2},{3},{4}\n'.format(fin.readline().rstrip(),
                                                      'TWHandle', 'TWSim',
                                                      'YTId', 'YTUser'))
            for line in fin:
                num_media += 1
                head, website_url = line.rstrip().rsplit(',', 1)
                try:
                    tw_handle, tw_sim, yt_id, yt_user = crawl_social_media_from_url(
                        website_url, api)
                    fout.write('{0},{1},{2},{3},{4}\n'.format(
                        line.rstrip(), tw_handle, tw_sim, yt_id, yt_user))
                    print(
                        'crawled accounts: {0:>10} | {1:>10} | {2:>10}'.format(
                            tw_handle, yt_id, yt_user))
                    if tw_handle != '':
                        num_found_twitter += 1
                    if yt_id != '' or yt_user != '':
                        num_found_youtube += 1
                    if isinstance(tw_sim, float) and tw_sim < 0.5:
                        print('+++ Twitter handle to be reviewed: {0} {1:.4f}'.
                              format(tw_handle, tw_sim))
                except Exception:
                    num_fail += 1
                    continue
                print(
                    '>>> {0}/{2} twitter handles are found; {1}/{2} youtube ids'
                    .format(num_found_twitter, num_found_youtube, num_media))

    print('in total, {0} media websites not accessible'.format(num_fail))

    timer.stop()
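crawl_social_media_from_url is not included; judging from its return values, it fetches a media outlet's homepage, looks for Twitter and YouTube account links, and scores how similar the Twitter handle is to the site's domain. A rough, hypothetical sketch of that idea (the real function also uses the tweepy API object, which is ignored here):

import re
from difflib import SequenceMatcher
from urllib.parse import urlparse

import requests


def crawl_social_media_from_url(website_url, api=None):
    """Hypothetical sketch: scrape a homepage for Twitter/YouTube account links."""
    html = requests.get(website_url, timeout=10).text
    domain = urlparse(website_url).netloc.split('.')[-2]

    tw_match = re.search(r'twitter\.com/([A-Za-z0-9_]+)', html)
    tw_handle = tw_match.group(1) if tw_match else ''
    # similarity between handle and domain, used above to flag dubious matches
    tw_sim = SequenceMatcher(None, tw_handle.lower(), domain.lower()).ratio() if tw_handle else ''

    yt_id_match = re.search(r'youtube\.com/channel/([A-Za-z0-9_-]+)', html)
    yt_user_match = re.search(r'youtube\.com/user/([A-Za-z0-9_-]+)', html)
    yt_id = yt_id_match.group(1) if yt_id_match else ''
    yt_user = yt_user_match.group(1) if yt_user_match else ''
    return tw_handle, tw_sim, yt_id, yt_user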
def main():
    # == == == == == == == == Part 1: Set up experiment parameters == == == == == == == == #
    total_start_time = time.time()

    data_prefix = '../data/'
    forecast_filepath = 'vevo_forecast_data_60k.tsv'
    recsys_dirpath = 'recsys'
    snapshot_dirpath = 'network_pickle'

    if not os.path.exists(os.path.join(data_prefix, snapshot_dirpath)):
        os.mkdir(os.path.join(data_prefix, snapshot_dirpath))

    # == == == == == == Part 2: Load vevo en videos 61k dataset == == == == == == #
    vid_embed_dict = {}
    vid_view_dict = {}
    with open(os.path.join(data_prefix, forecast_filepath), 'r') as fin:
        for line in fin:
            embed, vid, ts_view, total_view = line.rstrip().split('\t')
            vid_embed_dict[vid] = int(embed)
            ts_view = np.array(intify(ts_view.split(',')))
            vid_view_dict[vid] = ts_view
    vevo_en_vid_list = list(sorted(vid_embed_dict.keys()))
    num_videos = len(vevo_en_vid_list)

    for t in range(T):
        timer = Timer()
        timer.start()

        target_date_str = obj2str(datetime(2018, 9, 1) + timedelta(days=t))
        recsys_filepath = 'recsys_{0}.json'.format(target_date_str)
        snapshot_filepath = 'network_{0}.p'.format(target_date_str)
        network_mat = {embed: [] for embed in range(num_videos)}

        with open(os.path.join(data_prefix, recsys_dirpath, recsys_filepath), 'r') as fin:
            for line in fin:
                network_json = json.loads(line.rstrip())
                source = network_json['vid']
                targets = network_json['relevant_list'][: MAX_POSITION]
                for position, target in enumerate(targets):
                    if target in vid_embed_dict:  # same keys as vevo_en_vid_list, but O(1) lookup
                        # add embedding of incoming video and position of target video on source video
                        network_mat[vid_embed_dict[target]].append((vid_embed_dict[source], position, vid_view_dict[source][t]))

        with open(os.path.join(data_prefix, snapshot_dirpath, snapshot_filepath), 'wb') as fout:
            pickle.dump(network_mat, fout)

        print('>>> Finish dumping date {0}'.format(target_date_str))
        timer.stop()

    print('>>> Network structure has been dumped!')
    print('>>> Total elapsed time: {0}\n'.format(str(timedelta(seconds=time.time() - total_start_time))[:-3]))
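This example and a later one use small serialization helpers (intify, strify, obj2str) that are defined elsewhere; a plausible minimal version, assuming they simply convert between comma-separated strings, integer lists, and date strings:

def intify(str_list):
    # ['1', '2', '3'] -> [1, 2, 3]
    return [int(x) for x in str_list]


def strify(int_list, delimiter=','):
    # [1, 2, 3] -> '1,2,3'
    return delimiter.join(str(x) for x in int_list)


def obj2str(dt_obj, fmt='%Y-%m-%d'):
    # datetime(2018, 9, 1) -> '2018-09-01'
    return dt_obj.strftime(fmt)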
Example #6
def main():
    timer = Timer()
    timer.start()

    n = 63
    # set to 10,000 for faster computing; increase for a more precise estimate
    num_sim = 10000

    with open('./justify_persistent_link.log', 'w') as fout:
        for p in np.arange(0, 1.01, 0.01):
            fout.write('p_form: {0:.2f}, p_persistent_link: {1:.4f}\n'.format(
                p, simulate_for_prob(p, n, num_sim)))
            print('>>> Finish simulating at prob {0:.2f}'.format(p))

    timer.stop()
def main():
    # == == == == == == Part 1: Set up experiment parameters == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/'
    vevo_en_videos_path = 'vevo_en_videos_60k.json'
    vevo_forecast_filepath = 'vevo_forecast_data_60k.tsv'
    vevo_embed_filepath = 'vevo_en_embeds_60k.txt'

    # == == == == == == Part 2: Load Vevo en forecast data == == == == == == #
    vevo_en_vid_list = []
    vid_title_dict = {}
    vid_forecast_view_dict = {}

    with open(os.path.join(data_prefix, vevo_en_videos_path), 'r') as fin:
        for line in fin:
            video_json = json.loads(line.rstrip())
            vid = video_json['id']
            vevo_en_vid_list.append(vid)

            title = (video_json['snippet']['title'].encode(
                'ascii', 'ignore')).decode('utf-8')
            vid_title_dict[vid] = title

            daily_view = video_json['insights']['dailyView']
            forecast_view = daily_view[-T:]
            vid_forecast_view_dict[vid] = forecast_view

    vevo_en_vid_list = sorted(vevo_en_vid_list)
    num_videos = len(vevo_en_vid_list)

    with open(os.path.join(data_prefix, vevo_forecast_filepath), 'w') as fout:
        for embed in range(num_videos):
            vid = vevo_en_vid_list[embed]
            forecast_view = vid_forecast_view_dict[vid]
            fout.write('{0}\t{1}\t{2}\t{3}\n'.format(
                embed, vid, strify(forecast_view, delimiter=','),
                np.sum(forecast_view)))

    with open(os.path.join(data_prefix, vevo_embed_filepath),
              'w',
              encoding='utf-8') as fout:
        for embed in range(num_videos):
            vid = vevo_en_vid_list[embed]
            fout.write('{0},{1},{2}\n'.format(embed, vid, vid_title_dict[vid]))

    timer.stop()
Example #8
def main():
    timer = Timer()
    timer.start()

    input_filepath = 'data/mbfc/to_crawl_users.csv'
    output_filepath = 'data/mbfc/active_user_subscription.json.bz2'

    visited_channel_set = set()
    if os.path.exists(output_filepath):
        with bz2.BZ2File(output_filepath, 'r') as fin:
            for line in fin:
                line = line.decode('utf-8')
                channel_id = json.loads(line.rstrip())['channel_id']
                visited_channel_set.add(channel_id)
    print('visited {0} channels in the past, continue...'.format(
        len(visited_channel_set)))

    num_user = len(visited_channel_set)
    with bz2.open(output_filepath, 'at') as fout:
        with open(input_filepath, 'r') as fin:
            for line in fin:
                user_id = line.rstrip().split(',')[0]
                if user_id not in visited_channel_set:
                    num_request = 0
                    found = False
                    print('get description and subscriptions for user {0}'.
                          format(user_id))
                    while num_request < 5:
                        try:
                            profile_json = get_subscriptions_from_channel(
                                user_id, target='subscription')
                            found = True
                        except Exception:
                            num_request += 1

                        if found:
                            fout.write('{0}\n'.format(
                                json.dumps(profile_json)))
                            num_user += 1
                            print(
                                '{0} subscriptions are obtained for user {1}: {2}\n'
                                .format(len(profile_json['subscriptions']),
                                        num_user, user_id))
                            time.sleep(1)
                            break

    timer.stop()
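This example and two later crawlers repeat the same retry-up-to-five-times loop around a single API call. One optional way to factor that pattern out (a refactor sketch, not part of the original code):

import time


def retry(func, *args, max_attempts=5, cooldown=1, **kwargs):
    """Call func(*args, **kwargs), retrying on any exception; return None after max_attempts failures."""
    for attempt in range(max_attempts):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            print('attempt {0} failed: {1}'.format(attempt + 1, e))
            time.sleep(cooldown)
    return None

With such a helper, the inner while loop above could be replaced by, e.g., profile_json = retry(get_subscriptions_from_channel, user_id, target='subscription').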
def main():
    app_name = 'covid'
    if app_name == 'cyberbullying':
        target_suffix = ['1', '2', '3', '4', '5', '6', '7', '8', 'all']
    elif app_name == 'youtube' or app_name == 'covid':
        target_suffix = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', 'all']
    else:
        target_suffix = ['1', '2', '3', '4', '5', '6', '7', '8', 'all']

    archive_dir = '../data/{0}_out'.format(app_name)

    timer = Timer()
    timer.start()

    est_num_tweet = 0

    for suffix in target_suffix:
        num_tweet = 0
        num_ratemsg = 0
        track_list = []
        with bz2.BZ2File(os.path.join(archive_dir, '{0}_{1}/ts_{0}_{1}.bz2'.format(app_name, suffix)), mode='r') as fin:
            for line in fin:
                line = line.decode('utf-8')
                if 'ratemsg' in line:
                    num_ratemsg += 1
                    track_list.append(int(line.rstrip().split(',')[2]))
                else:
                    num_tweet += 1
        num_miss = count_track(track_list)
        subcrawler_sampling_rate = num_tweet / (num_tweet + num_miss)
        print('>>> subcrawler {0}_{1: <3}, {2: >9d} retrieved tweets, {3: >7d} rate limit track, indicating {4: >9d} missing tweets, yielding {5: >6.2f}% sampling rate'
              .format(app_name, suffix, num_tweet, num_ratemsg, num_miss, 100 * subcrawler_sampling_rate))
        if suffix == 'all':
            est_num_tweet = num_tweet + num_miss

    gt_num_tweet = 0
    with bz2.BZ2File(os.path.join(archive_dir, 'complete_ts_{0}.bz2'.format(app_name)), mode='r') as fin:
        for _ in fin:
            gt_num_tweet += 1
    gt_sampling_rate = gt_num_tweet / est_num_tweet
    print('>>> complete_set {0}  , {1: >9d} retrieved tweets, {2: >7} rate limit track, estimating {3: >9d}   total tweets, yielding {4: >6.2f}% sampling rate'
          .format(app_name, gt_num_tweet, 'NaN', est_num_tweet, 100 * gt_sampling_rate))

    timer.stop()
def main():
    timer = Timer()
    timer.start()

    app_name = 'mbfc'

    input_filepath = 'data/{0}/{0}_ratings.csv'.format(app_name)
    output_filepath = 'data/{0}/MBFC_featured_channels.json'.format(app_name)

    with open(output_filepath, 'w') as fout:
        with open(input_filepath, 'r') as fin:
            fin.readline()
            for line in fin:
                _, channel_id, _, _, is_political = line.rstrip().rsplit(
                    ',', 4)
                if is_political == 'Y':
                    num_request = 0
                    found = False
                    print('get featured channels for channel {0}'.format(
                        channel_id))
                    while num_request < 5:
                        try:
                            profile_json = get_subscriptions_from_channel(
                                channel_id, target='featured')
                            found = True
                        except Exception:
                            num_request += 1

                        if found:
                            if len(profile_json['featured_channels']) > 0:
                                fout.write('{0}\n'.format(
                                    json.dumps(profile_json)))
                                print(json.dumps(profile_json))
                                print(
                                    '{0} featured channels are obtained for channel {1}\n'
                                    .format(
                                        len(profile_json['featured_channels']),
                                        channel_id))
                            time.sleep(1)
                            break

    timer.stop()
def main():
    app_name = 'cyberbullying'
    if app_name == 'cyberbullying':
        target_suffix = ['1', '2', '3', '4', '5', '6', '7', '8', 'all']
    elif app_name == 'youtube':
        target_suffix = [
            '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
            'all'
        ]
    else:
        target_suffix = ['1', '2', '3', '4', '5', '6', '7', '8', 'all']

    archive_dir = '../data/{0}_out'.format(app_name)

    timer = Timer()
    timer.start()

    print('>>> Merging user profile')

    fout = open(
        os.path.join(archive_dir,
                     'complete_user_profile_{0}.txt'.format(app_name)), 'w')
    visited_user_id_str = set()
    num_users = 0
    for suffix in target_suffix:
        with bz2.BZ2File(os.path.join(
                archive_dir, '{0}_{1}_user.txt.bz2'.format(app_name, suffix)),
                         mode='r') as fin:
            for line in fin:
                line = line.decode('utf8')
                user_id_str, _ = line.rstrip().split(',', 1)
                if user_id_str not in visited_user_id_str:
                    fout.write(line)
                    visited_user_id_str.add(user_id_str)
                    num_users += 1
    print('>>> We retrieve profiles for {0} users'.format(num_users))
    fout.close()

    timer.stop()
def main():
    timer = Timer()
    timer.start()

    input_filepath = 'data/mbfc/to_crawl_vid.txt'
    output_filepath = 'data/mbfc/MBFC_video_metadata.json.bz2'

    visited_video_set = set()
    if os.path.exists(output_filepath):
        with bz2.BZ2File(output_filepath, mode='r') as fin:
            for line in fin:
                video_json = json.loads(line.rstrip())
                if 'vid' in video_json:
                    visited_video_set.add(video_json['vid'])
    print('visited {0} videos in the past, continue...'.format(len(visited_video_set)))

    num_video = len(visited_video_set)
    total_num_request = 0
    with bz2.open(output_filepath, 'at') as fout:
        with open(input_filepath, 'r') as fin:
            for line in fin:
                video_id = line.rstrip()
                if video_id not in visited_video_set:
                    try:
                        video_metadata = get_video_metadata(video_id)
                        total_num_request += video_metadata.pop('num_request', 0)
                        if len(video_metadata) > 0:
                            visited_video_set.add(video_id)
                            fout.write('{0}\n'.format(json.dumps(video_metadata)))
                            num_video += 1
                        else:
                            print('xxx error, failed in crawling metadata for video {0}'.format(video_id))
                    except Exception as e:
                        print(str(e))
                        break
                    print('>>> so far crawled {0} videos, {1} requests are sent'.format(num_video, total_num_request))

    print('>>> reach file end!')
    timer.stop()
def main():
    timer = Timer()
    timer.start()

    app_name = 'mbfc'

    current_date = datetime.now().strftime('%Y-%m-%d')
    input_filepath = 'data/{0}/{0}_ratings.csv'.format(app_name)
    output_filepath = 'data/{0}/{0}_video_ids_{1}.json'.format(app_name, current_date)
    visited_channel_set = set()
    if os.path.exists(output_filepath):
        with open(output_filepath, 'r') as fin:
            for line in fin:
                visited_channel_set.add(json.loads(line.rstrip())['channel_id'])
    print('visited {0} channels in the past, continue...'.format(len(visited_channel_set)))

    idx_media = len(visited_channel_set)
    with open(output_filepath, 'a') as fout:
        with open(input_filepath, 'r') as fin:
            fin.readline()
            for line in fin:
                channel_id = line.rstrip().split(',')[-1]
                if channel_id != '' and channel_id not in visited_channel_set:
                    print('get videos for media {0}'.format(channel_id))
                    # a channel's uploads playlist id is its channel id with the leading 'UC' replaced by 'UU'
                    upload_playlist = 'UU' + channel_id[2:]
                    channel_video_ids = []
                    num_fail = 0
                    while num_fail < 5:
                        try:
                            channel_video_ids = get_videos_from_playlist(upload_playlist)
                            break
                        except Exception:
                            num_fail += 1

                    fout.write('{0}\n'.format(json.dumps({'channel_id': channel_id, 'playlist': channel_video_ids})))
                    visited_channel_set.add(channel_id)
                    idx_media += 1
                    print('{0} video ids are obtained for media {1}: {2}\n'.format(len(channel_video_ids), idx_media, channel_id))

    timer.stop()
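get_videos_from_playlist is not shown; it presumably pages through the YouTube Data API v3 playlistItems endpoint for the channel's uploads playlist. A hypothetical sketch using the requests library, assuming an API key in DEVELOPER_KEY (the real crawler reads this from its config):

import requests

DEVELOPER_KEY = 'YOUR_API_KEY'  # assumption: supplied via config in the real crawler


def get_videos_from_playlist(playlist_id):
    """Return all video ids in a playlist via the YouTube Data API v3 (hypothetical sketch)."""
    video_ids, page_token = [], None
    while True:
        params = {'part': 'contentDetails', 'playlistId': playlist_id,
                  'maxResults': 50, 'key': DEVELOPER_KEY}
        if page_token:
            params['pageToken'] = page_token
        response = requests.get('https://www.googleapis.com/youtube/v3/playlistItems',
                                params=params).json()
        video_ids.extend(item['contentDetails']['videoId'] for item in response.get('items', []))
        page_token = response.get('nextPageToken')
        if not page_token:
            break
    return video_ids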
Example #14
def main():
    timer = Timer()
    timer.start()

    youtube_key = conf.youtube_key
    parts = 'id'

    yt_crawler = YTCrawler()
    yt_crawler.set_key(youtube_key)

    app_name = 'mbfc'

    with open('data/{0}/{0}_ratings_v6.csv'.format(app_name), 'w') as fout:
        with open('data/{0}/{0}_ratings_v5.csv'.format(app_name), 'r') as fin:
            fout.write(fin.readline())
            for line in fin:
                title, tail = line.rstrip().split(',', 1)
                middle, yt_id, yt_user = tail.rsplit(',', 2)
                if yt_id == '' and yt_user == '':
                    fout.write(line)
                elif yt_id != '':
                    yt_id = yt_crawler.check_channel_id(yt_id, parts)
                    if yt_id == '':
                        print('--- Channel id crawler failed on title {0}'.
                              format(title))
                    fout.write('{0},{1},{2},{3}\n'.format(
                        title, middle, yt_id, yt_user))
                else:
                    yt_id = yt_crawler.get_channel_id(yt_user, parts)
                    if yt_id == '':
                        print('--- Channel id crawler failed on title {0}'.
                              format(title))
                    fout.write('{0},{1},{2},{3}\n'.format(
                        title, middle, yt_id, yt_user))

    timer.stop()
def main():
    timer = Timer()
    timer.start()

    app_names = ['cyberbullying', 'youtube']
    # data for plot subfig (a)
    showcase_segment_idx = 0
    showcase_complete_tid_list = []
    showcase_retrieved_tid_list = []
    showcase_ratemsg_list = []
    showcase_track_list = []

    # data for plot subfig (b)
    mape_dict = {app_name: [] for app_name in app_names}

    rate_silence_length = 10000
    disconnect_silence_length = 180000
    print('>>> We silence {0} seconds around rate limit messages'.format(
        rate_silence_length // 1000))
    print('>>> We silence {0} seconds preceding disconnect messages\n'.format(
        disconnect_silence_length // 1000))

    for app_name in app_names:
        print('>>> Computing on app {0}'.format(app_name))
        archive_dir = './{0}_out'.format(app_name)

        sample_input_path = os.path.join(archive_dir,
                                         'ts_{0}_all.txt'.format(app_name))
        complete_input_path = os.path.join(
            archive_dir, 'complete_ts_{0}.txt'.format(app_name))

        # == == == == == == Part 1: Initially select segments in the complete set == == == == == == #
        # segments that silence 10s around rate limit messages and 180s preceding disconnect messages in complete set
        init_segment_list = []
        init_start_ts = 0
        with open(complete_input_path, 'r') as fin:
            for line in fin:
                split_line = line.rstrip().split(',')
                # if it is a disconnect msg
                if 'disconnect' in split_line[1]:
                    disconnect_ts = int(split_line[0])
                    # disconnect message, remove the preceding [disconnect_silence_length]
                    init_end_ts = disconnect_ts - disconnect_silence_length
                    if init_end_ts > init_start_ts:
                        init_segment_list.append((init_start_ts, init_end_ts,
                                                  init_end_ts - init_start_ts))
                    init_start_ts = disconnect_ts
                # elif it is a rate limit msg
                elif 'ratemsg' in split_line[1]:
                    ratemsg_ts = int(split_line[0])
                    # rate limit message, remove the surrounding [rate_silence_length]
                    init_end_ts = ratemsg_ts - rate_silence_length // 2
                    if init_end_ts > init_start_ts:
                        init_segment_list.append((init_start_ts, init_end_ts,
                                                  init_end_ts - init_start_ts))
                    init_start_ts = ratemsg_ts + rate_silence_length // 2
        print(
            '>>> Initially, we identify {0} segments in complete set without rate limit message'
            .format(len(init_segment_list)))
        # print(init_segment_list[: 10])

        # == == == == == == Part 2: Segments are bounded by 2 rate limit messages in the sample set == == == == == == #
        bounded_segment_list = []
        current_segment_idx = 0
        current_start_ts = 0
        current_ratemsg_list = []
        current_track_list = []
        last_ratemsg_ts = 0

        look_for_end = False
        found_showcase = False

        with open(sample_input_path, 'r') as fin:
            for line in fin:
                split_line = line.rstrip().split(',')
                if 'ratemsg' in split_line[1]:
                    ratemsg_ts = int(split_line[0])
                    track = int(split_line[2])
                    if not look_for_end or (
                            look_for_end
                            and current_start_ts == last_ratemsg_ts
                            and init_segment_list[current_segment_idx][1] <
                            ratemsg_ts):
                        # fast forward, skip some really short segments
                        while ratemsg_ts >= init_segment_list[
                                current_segment_idx][1]:
                            current_segment_idx += 1
                        if current_segment_idx == len(init_segment_list):
                            break
                        if ratemsg_ts >= init_segment_list[
                                current_segment_idx][0]:
                            current_start_ts = ratemsg_ts
                            current_ratemsg_list = [ratemsg_ts]
                            current_track_list = [track]
                            look_for_end = True
                        else:
                            look_for_end = False
                    elif look_for_end:
                        if current_start_ts < last_ratemsg_ts <= init_segment_list[
                                current_segment_idx][1] < ratemsg_ts:
                            current_num_miss = count_track(
                                current_track_list,
                                start_with_rate=True,
                                subcrawler=False)
                            bounded_segment_list.append(
                                (current_start_ts, last_ratemsg_ts,
                                 last_ratemsg_ts - current_start_ts,
                                 current_num_miss))

                            # find the first example segment that is around 11 sec long
                            if app_name == 'cyberbullying' and not found_showcase and 10000 <= last_ratemsg_ts - current_start_ts <= 12000:
                                showcase_segment_idx = len(
                                    bounded_segment_list) - 1
                                showcase_ratemsg_list = copy.deepcopy(
                                    current_ratemsg_list)
                                showcase_track_list = copy.deepcopy(
                                    current_track_list)
                                found_showcase = True

                            current_segment_idx += 1
                            if current_segment_idx == len(init_segment_list):
                                break
                            if ratemsg_ts >= init_segment_list[
                                    current_segment_idx][0]:
                                current_start_ts = ratemsg_ts
                                current_ratemsg_list = [ratemsg_ts]
                                current_track_list = [track]
                                look_for_end = True
                            else:
                                look_for_end = False
                        else:
                            current_ratemsg_list.append(ratemsg_ts)
                            current_track_list.append(track)

                    last_ratemsg_ts = ratemsg_ts
                    if current_segment_idx == len(init_segment_list):
                        break
        print('>>> We further bound {0} segments with 2 rate limit messages'.
              format(len(bounded_segment_list)))
        # print(bounded_segment_list[-10:])

        # == == == == == == Part 3: Add sample and complete volume in each segment == == == == == == #
        for input_path, tid_list in zip(
            [sample_input_path, complete_input_path],
            [showcase_retrieved_tid_list, showcase_complete_tid_list]):
            current_segment_idx = 0
            current_segment_cnt = 0
            with open(input_path, 'r') as fin:
                for line in fin:
                    split_line = line.rstrip().split(',')
                    if len(split_line) == 2:
                        msg_ts = int(split_line[0])
                        if bounded_segment_list[current_segment_idx][
                                0] < msg_ts <= bounded_segment_list[
                                    current_segment_idx][1]:
                            current_segment_cnt += 1

                            if app_name == 'cyberbullying' and current_segment_idx == showcase_segment_idx:
                                tweet_id = split_line[1]
                                tid_list.append(tweet_id)
                        elif msg_ts > bounded_segment_list[
                                current_segment_idx][1]:
                            bounded_segment_list[current_segment_idx] = (
                                *bounded_segment_list[current_segment_idx],
                                current_segment_cnt)
                            current_segment_idx += 1
                            current_segment_cnt = 0
                            if current_segment_idx == len(
                                    bounded_segment_list):
                                break
            # print(bounded_segment_list[-10:])

        length_tracker = 0
        mape_list = []
        for segment in bounded_segment_list:
            length_tracker += segment[2]
            mape_list.append(mape(segment[-1], segment[-2] + segment[-3]))
        mape_dict[app_name] = copy.deepcopy(mape_list)
        print('MAPE: {0:.5f} +- {1:.5f}, median: {2:.5f}'.format(
            np.mean(mape_list), np.std(mape_list), np.median(mape_list)))
        print('total tracked days bounded: {0:.2f} out of 14'.format(
            length_tracker / 1000 / 60 / 60 / 24))

        if app_name == 'cyberbullying':
            print(
                'complete tweets: {0}, retrieved tweets: {1}, estimated missing: {2}'
                .format(
                    len(showcase_complete_tid_list),
                    len(showcase_retrieved_tid_list),
                    count_track(showcase_track_list,
                                start_with_rate=True,
                                subcrawler=False)))
            print('ratemsg timestamp', showcase_ratemsg_list)
            print('ratemsg track', showcase_track_list)
        print()

    timer.stop()

    # == == == == == == Part 5: Plot a showcase segment that is roughly 10s == == == == == == #
    cc4 = ColorPalette.CC4
    blue = cc4[0]
    green = cc4[1]
    red = cc4[3]
    fig, axes = plt.subplots(1, 4, figsize=(12, 1.6))
    ax2 = axes[-1]
    gs = axes[1].get_gridspec()
    for ax in axes[:-1]:
        ax.remove()
    ax1 = fig.add_subplot(gs[:-1])

    # add a timeline
    ax1.axhline(0, linewidth=2, color='k')

    observed_tweet_ts_list = sorted(
        [melt_snowflake(tid)[0] for tid in showcase_retrieved_tid_list])
    showcase_missing_tid_set = set(showcase_complete_tid_list).difference(
        set(showcase_retrieved_tid_list))
    missing_tweet_ts_list = sorted(
        [melt_snowflake(tid)[0] for tid in showcase_missing_tid_set])
    ax1.scatter(observed_tweet_ts_list, [1] * len(observed_tweet_ts_list),
                marker='o',
                facecolors='none',
                edgecolors=blue,
                lw=1,
                s=20)
    ax1.scatter(missing_tweet_ts_list, [0.5] * len(missing_tweet_ts_list),
                marker='x',
                c='k',
                lw=1,
                s=20)
    # stats for missing tweets, cut by rate limit msg timestamp_ms
    complete_track_list = []
    i, j, curr_cnt = 0, 1, 0
    while i < len(missing_tweet_ts_list) and j < len(showcase_ratemsg_list):
        if missing_tweet_ts_list[i] <= showcase_ratemsg_list[j]:
            curr_cnt += 1
            i += 1
        else:
            complete_track_list.append(curr_cnt)
            curr_cnt = 0
            j += 1
    complete_track_list.append(curr_cnt)
    # print(complete_track_list)

    for idx, ts in enumerate(showcase_ratemsg_list):
        ax1.axvline(ts, ymin=0, ymax=1.1, linewidth=1, color='k')

    for idx, ts in enumerate(showcase_ratemsg_list[1:]):
        ax1.text(ts - 50,
                 0.42,
                 '/{0:>3}'.format(complete_track_list[idx]),
                 color='k',
                 ha='right',
                 va='top',
                 size=10)
        ax1.text(ts - 470,
                 0.42,
                 str(showcase_track_list[idx + 1] - showcase_track_list[idx]),
                 color=green,
                 ha='right',
                 va='top',
                 size=10)

    ax1.xaxis.set_major_formatter(FuncFormatter(to_datetime))
    ax1.set_xlim(left=showcase_ratemsg_list[0] - 200,
                 right=showcase_ratemsg_list[-1] + 200)
    ax1.set_yticks([0.5, 1.0])
    ax1.set_ylim(top=1.2, bottom=0)
    num_missing_by_counting = len(showcase_complete_tid_list) - len(
        showcase_retrieved_tid_list)
    num_missing_by_estimating = count_track(showcase_track_list,
                                            start_with_rate=True,
                                            subcrawler=False)
    num_observed_tweets = len(showcase_retrieved_tid_list)
    ax1.tick_params(axis='x', which='major', labelsize=10)
    ax1.tick_params(axis='y', which='both', length=0)
    ax1.set_yticklabels([
        'missing tweets\n{0}/{1}'.format(num_missing_by_estimating,
                                         num_missing_by_counting),
        'collected tweets\n{0}'.format(num_observed_tweets)
    ],
                        fontsize=10)

    # remove borders
    ax1.spines['right'].set_visible(False)
    ax1.spines['left'].set_visible(False)
    ax1.spines['top'].set_visible(False)
    ax1.spines['bottom'].set_visible(False)
    ax1.set_title('(a)', fontsize=11, pad=-1.35 * 72, y=1.0001)

    bplot = ax2.boxplot([mape_dict['cyberbullying'], mape_dict['youtube']],
                        labels=['Cyberbullying', 'YouTube'],
                        widths=0.5,
                        showfliers=False,
                        showmeans=False,
                        patch_artist=True)

    for patch, color in zip(bplot['boxes'], [blue, red]):
        patch.set_facecolor(color)

    for median in bplot['medians']:
        median.set(color='k', linewidth=1)

    ax2.tick_params(axis='both', which='major', labelsize=10)
    ax2.set_ylabel('MAPE', fontsize=10)
    ax2.spines['right'].set_visible(False)
    ax2.spines['top'].set_visible(False)
    ax2.set_title('(b)', fontsize=11, pad=-1.35 * 72, y=1.0001)

    plt.tight_layout(rect=[0, 0.03, 1, 1])
    plt.savefig('../images/validate_ratemsg.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
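melt_snowflake and mape are not defined in these snippets. Twitter snowflake ids embed a millisecond timestamp (offset from the snowflake epoch 1288834974657) in their top bits, which is what the tweet-id-to-timestamp conversions above and in the next example rely on; mape here appears to compute a single absolute percentage error per segment. Minimal sketches under those assumptions:

TWITTER_EPOCH_MS = 1288834974657  # 2010-11-04, the snowflake epoch


def melt_snowflake(tweet_id):
    """Decode a snowflake id into (timestamp_ms, machine_id, sequence)."""
    tweet_id = int(tweet_id)
    timestamp_ms = (tweet_id >> 22) + TWITTER_EPOCH_MS
    machine_id = (tweet_id >> 12) & 0x3FF
    sequence = tweet_id & 0xFFF
    return timestamp_ms, machine_id, sequence


def mape(true_value, predicted_value):
    # absolute percentage error for a single segment, as used above
    return abs(true_value - predicted_value) / true_value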
def main():
    timer = Timer()
    timer.start()

    app_name = 'cyberbullying'

    sample_cascade_size = {}
    sample_inter_arrival_time = []
    sample_cascade_influence = {}
    sample_cascade_influence_10m = defaultdict(int)
    sample_cascade_influence_1h = defaultdict(int)
    with open('../data/{0}_out/sample_retweet_{0}.txt'.format(app_name),
              'r') as fin:
        for line in fin:
            root_tweet, cascades = line.rstrip().split(':')
            cascades = cascades.split(',')
            root_tweet = root_tweet.split('-')[0]
            retweets = [x.split('-')[0] for x in cascades]
            influences = [int(x.split('-')[1]) for x in cascades]
            sample_cascade_size[root_tweet] = len(retweets)
            sample_cascade_influence[root_tweet] = sum(influences)
            root_timestamp = melt_snowflake(root_tweet)[0] / 1000
            retweet_timestamp_list = [root_timestamp]

            for i in range(len(retweets)):
                retweet_time = melt_snowflake(retweets[i])[0] / 1000
                relative_retweet_time = retweet_time - root_timestamp
                retweet_timestamp_list.append(
                    melt_snowflake(retweets[i])[0] / 1000)
                if relative_retweet_time < 10 * 60:
                    sample_cascade_influence_10m[root_tweet] += influences[i]
                if relative_retweet_time < 60 * 60:
                    sample_cascade_influence_1h[root_tweet] += influences[i]

            for i in range(len(retweet_timestamp_list) - 1):
                sample_inter_arrival_time.append(retweet_timestamp_list[i +
                                                                        1] -
                                                 retweet_timestamp_list[i])

    complete_cascade_size = {}
    complete_inter_arrival_time = []
    complete_cascade_influence = {}
    complete_cascade_influence_10m = defaultdict(int)
    complete_cascade_influence_1h = defaultdict(int)
    with open('../data/{0}_out/complete_retweet_{0}.txt'.format(app_name),
              'r') as fin:
        for line in fin:
            root_tweet, cascades = line.rstrip().split(':')
            cascades = cascades.split(',')
            root_tweet = root_tweet.split('-')[0]
            retweets = [x.split('-')[0] for x in cascades]
            complete_cascade_size[root_tweet] = len(retweets)
            if len(retweets) >= 50:
                influences = [int(x.split('-')[1]) for x in cascades]
                complete_cascade_influence[root_tweet] = sum(influences)
                root_timestamp = melt_snowflake(root_tweet)[0] / 1000
                retweet_timestamp_list = [root_timestamp]

                for i in range(len(retweets)):
                    retweet_time = melt_snowflake(retweets[i])[0] / 1000
                    relative_retweet_time = retweet_time - root_timestamp
                    retweet_timestamp_list.append(
                        melt_snowflake(retweets[i])[0] / 1000)
                    if relative_retweet_time < 10 * 60:
                        complete_cascade_influence_10m[
                            root_tweet] += influences[i]
                    if relative_retweet_time < 60 * 60:
                        complete_cascade_influence_1h[
                            root_tweet] += influences[i]

                for i in range(len(retweet_timestamp_list) - 1):
                    complete_inter_arrival_time.append(
                        retweet_timestamp_list[i + 1] -
                        retweet_timestamp_list[i])

    print('number of cascades in the complete set', len(complete_cascade_size))
    print('number of cascades in the sample set', len(sample_cascade_size))

    print('mean complete size', np.mean(list(complete_cascade_size.values())))
    print('mean sample size', np.mean(list(sample_cascade_size.values())))

    print('complete #cascades (≥50 retweets)',
          sum([1 for x in list(complete_cascade_size.values()) if x >= 50]))
    print('sample #cascades (≥50 retweets)',
          sum([1 for x in list(sample_cascade_size.values()) if x >= 50]))

    num_complete_cascades_in_sample = 0
    complete_cascades_in_sample_size_list = []
    num_complete_cascades_in_sample_50 = 0
    for root_tweet in sample_cascade_size:
        if sample_cascade_size[root_tweet] == complete_cascade_size[
                root_tweet]:
            num_complete_cascades_in_sample += 1
            complete_cascades_in_sample_size_list.append(
                complete_cascade_size[root_tweet])
            if complete_cascade_size[root_tweet] >= 50:
                num_complete_cascades_in_sample_50 += 1
    print('number of complete cascades in the sample set',
          num_complete_cascades_in_sample)
    print('number of complete cascades (>50 retweets) in the sample set',
          num_complete_cascades_in_sample_50)
    print('max: {0}, mean: {1}'.format(
        max(complete_cascades_in_sample_size_list),
        np.mean(complete_cascades_in_sample_size_list)))

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))

    cc4 = ColorPalette.CC4
    blue = cc4[0]
    red = cc4[3]

    sample_median = np.median(sample_inter_arrival_time)
    complete_median = np.median(complete_inter_arrival_time)

    plot_ccdf(sample_inter_arrival_time,
              ax=axes[0],
              color=blue,
              ls='-',
              label='sample')
    plot_ccdf(complete_inter_arrival_time,
              ax=axes[0],
              color='k',
              ls='-',
              label='complete')

    axes[0].plot([sample_median, sample_median], [0, 1],
                 color=blue,
                 ls='--',
                 lw=1)
    axes[0].plot([complete_median, complete_median], [0, 1],
                 color='k',
                 ls='--',
                 lw=1)

    print('\ninter_arrival_time sample median', sample_median)
    print('inter_arrival_time complete median', complete_median)

    axes[0].set_xscale('symlog')
    axes[0].set_xticks([0, 1, 100, 10000, 1000000])
    axes[0].set_yscale('linear')
    axes[0].set_xlabel('inter-arrival time (sec)', fontsize=16)
    axes[0].set_ylabel(r'$P(X \geq x)$', fontsize=16)
    axes[0].legend(frameon=False,
                   fontsize=16,
                   ncol=1,
                   fancybox=False,
                   shadow=True,
                   loc='upper right')
    axes[0].tick_params(axis='both', which='major', labelsize=16)
    axes[0].set_title('(a)', fontsize=18, pad=-3 * 72, y=1.0001)

    influence_list = []
    influence_list_10m = []
    influence_list_1h = []
    for root_tweet in sample_cascade_size:
        if complete_cascade_size[root_tweet] >= 50:
            if complete_cascade_influence[root_tweet] > 0:
                influence_list.append(sample_cascade_influence[root_tweet] /
                                      complete_cascade_influence[root_tweet])
            if complete_cascade_influence_10m[root_tweet] > 0:
                influence_list_10m.append(
                    sample_cascade_influence_10m[root_tweet] /
                    complete_cascade_influence_10m[root_tweet])
            if complete_cascade_influence_1h[root_tweet] > 0:
                influence_list_1h.append(
                    sample_cascade_influence_1h[root_tweet] /
                    complete_cascade_influence_1h[root_tweet])

    plot_ccdf(influence_list_10m, ax=axes[1], color=red, ls='-', label='10m')
    plot_ccdf(influence_list_1h, ax=axes[1], color=blue, ls='-', label='1h')
    plot_ccdf(influence_list, ax=axes[1], color='k', ls='-', label='14d')

    print('influence_list median', np.median(influence_list))
    print('influence_list_1h median', np.median(influence_list_1h))
    print('influence_list_10m median', np.median(influence_list_10m))

    print('influence_list 0.25', percentileofscore(influence_list, 0.25))
    print('influence_list 0.25', percentileofscore(influence_list_1h, 0.25))
    print('influence_list 0.25', percentileofscore(influence_list_10m, 0.25))

    print('influence_list 0.75', percentileofscore(influence_list, 0.75))
    print('influence_list 0.75', percentileofscore(influence_list_1h, 0.75))
    print('influence_list 0.75', percentileofscore(influence_list_10m, 0.75))

    axes[1].set_xscale('linear')
    axes[1].set_yscale('linear')
    axes[1].set_xlabel('relative potential reach', fontsize=16)
    # axes[1].set_ylabel('$P(X \geq x)$', fontsize=16)
    axes[1].legend(frameon=False,
                   fontsize=16,
                   ncol=1,
                   fancybox=False,
                   shadow=True,
                   loc='upper right')
    axes[1].tick_params(axis='both', which='major', labelsize=16)
    axes[1].set_title('(b)', fontsize=18, pad=-3 * 72, y=1.0001)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/cascades_measures.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
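plot_ccdf draws an empirical complementary CDF; a minimal sketch of such a helper (assumed behavior: sort the sample and plot P(X >= x) against x):

import matplotlib.pyplot as plt
import numpy as np


def plot_ccdf(data, ax=None, **plot_kwargs):
    # empirical CCDF: fraction of observations >= each sorted value
    sorted_data = np.sort(data)
    ccdf = 1.0 - np.arange(len(sorted_data)) / len(sorted_data)
    (ax if ax is not None else plt).plot(sorted_data, ccdf, **plot_kwargs)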
def main():
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/recsys'

    num_relevant_by_rank = np.zeros((NUM_REL,))
    num_recommended_by_rank = np.zeros((NUM_REC,))

    # aggregate by rank1, rank2-5, rank6-10, rank11-15
    dense_relevant_in_recommended_mat = np.zeros((NUM_REL, 4))
    # aggregate by rank1, rank2-5, rank6-10, rank11-15, rank16-30, rank31-50
    dense_recommended_from_relevant_mat = np.zeros((NUM_REC, 6))

    relevant_in_recommended_arr = np.zeros((NUM_REL,))
    recommended_from_relevant_arr = np.zeros((NUM_REC,))

    # == == == == == == Part 2: Load both relevant list and recommended list == == == == == == #
    for subdir, _, files in os.walk(data_prefix):
        for f in files:
            with open(os.path.join(subdir, f), 'r') as fin:
                for line in fin:
                    network_json = json.loads(line.rstrip())
                    recommended_list = network_json['recommended_list'][: NUM_REC]
                    relevant_list = network_json['relevant_list'][: NUM_REL]

                    num_relevant_by_rank += np.pad(np.ones(len(relevant_list)), (0, NUM_REL - len(relevant_list)), 'constant')
                    num_recommended_by_rank += np.pad(np.ones(len(recommended_list)), (0, NUM_REC - len(recommended_list)), 'constant')

                    for rel_rank, vid in enumerate(relevant_list):
                        if vid in recommended_list:
                            relevant_in_recommended_arr[rel_rank] += 1
                            position_on_recommended = recommended_list.index(vid)
                            dense_relevant_in_recommended_mat[rel_rank, switch(position_on_recommended)] += 1

                    for rec_rank, vid in enumerate(recommended_list):
                        if vid in relevant_list:
                            recommended_from_relevant_arr[rec_rank] += 1
                            position_on_relevant = relevant_list.index(vid)
                            dense_recommended_from_relevant_mat[rec_rank, switch(position_on_relevant)] += 1

    # == == == == == == Part 3: Plot probabilities in each position == == == == == == #
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    axes = axes.ravel()
    color_cycle_6 = ColorPalette.CC6

    stackedBarPlot(ax=axes[0], data=dense_relevant_in_recommended_mat / num_relevant_by_rank.reshape(-1, 1),
                   cols=color_cycle_6,
                   edgeCols=['#000000'] * 4,
                   xlabel='position $x$ on relevant list',
                   ylabel='prob. of displaying on recommended list',
                   scale=False,
                   endGaps=True)

    axes[0].legend([plt.Rectangle((0, 0), 1, 1, fc=color_cycle_6[x], alpha=0.8, ec='k') for x in range(4)],
                   ['position 1', 'position 2-5', 'position 6-10', 'position 11-15'], fontsize=10,
                   frameon=False,
                   loc='upper right', fancybox=False, shadow=True, ncol=1)
    axes[0].set_title('(a)', fontsize=12)

    stackedBarPlot(ax=axes[1], data=dense_recommended_from_relevant_mat / num_recommended_by_rank.reshape(-1, 1),
                   cols=ColorPalette.CC6,
                   edgeCols=['#000000'] * 6,
                   xlabel='position $x$ on recommended list',
                   ylabel='prob. of originating from relevant list',
                   scale=False,
                   endGaps=True)

    axes[1].legend([plt.Rectangle((0, 0), 1, 1, fc=color_cycle_6[x], alpha=0.8, ec='k') for x in range(6)],
                   ['position 1', 'position 2-5', 'position 6-10', 'position 11-15', 'position 16-30', 'position 31-50'], fontsize=10,
                   frameon=False,
                   loc='upper right', fancybox=False, shadow=True, ncol=2)

    axes[1].set_title('(b)', fontsize=12)

    for ax in axes:
        ax.set_ylim(top=1)
        ax.set_ylim(bottom=0)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/data_rel2rec.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
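switch is not shown; from the aggregation comments above (rank 1, 2-5, 6-10, 11-15, 16-30, 31-50) it presumably maps a zero-based list position to one of those bins. A plausible reconstruction:

def switch(position):
    # map a zero-based rank to the bins: 1 | 2-5 | 6-10 | 11-15 | 16-30 | 31-50
    if position == 0:
        return 0
    elif position < 5:
        return 1
    elif position < 10:
        return 2
    elif position < 15:
        return 3
    elif position < 30:
        return 4
    return 5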
def main():
    timer = Timer()
    timer.start()

    n_cluster = 6

    for date_type in ['sample', 'complete']:
        uid_hid_stats = pickle.load(
            open('./{0}_uid_hid_stats.p'.format(date_type), 'rb'))
        hid_uid_stats = pickle.load(
            open('./{0}_hid_uid_stats.p'.format(date_type), 'rb'))

        num_users = len(uid_hid_stats)
        num_hashtags = len(hid_uid_stats)
        print('in {0} set, {1} users, {2} hashtags'.format(
            date_type, num_users, num_hashtags))

        all_graph = {
            uid: [x[0] for x in lst[1:]]
            for uid, lst in uid_hid_stats.items()
        }
        rev_all_graph = {
            hid: [x[0] for x in lst[1:]]
            for hid, lst in hid_uid_stats.items()
        }
        all_graph.update(rev_all_graph)

        all_bipartites = tarjan(all_graph)
        all_bipartites = sorted(all_bipartites,
                                key=lambda x: len(x),
                                reverse=True)
        print('number of bipartites: {0}'.format(len(all_bipartites)))

        largest_bipartite = all_bipartites[0]
        largest_bipartite_users = [
            x for x in largest_bipartite if x.startswith('u')
        ]
        largest_bipartite_hashtags = [
            x for x in largest_bipartite if x.startswith('h')
        ]
        largest_bipartite_num_users = len(largest_bipartite_users)
        largest_bipartite_num_hashtags = len(largest_bipartite_hashtags)
        print(
            'components of largest bipartite: {0} users; {1} hashtags'.format(
                largest_bipartite_num_users, largest_bipartite_num_hashtags))

        # B = nx.Graph()
        # # Add edges only between nodes of opposite node sets
        # bipartite_edges = []
        # for uid in largest_bipartite_users:
        #     for hid, cnt in uid_hid_stats[uid]:
        #         bipartite_edges.append((uid, hid, {'weight': cnt}))
        # B.add_edges_from(bipartite_edges)

        # re-embed
        new_user_embed = {
            uid: embed
            for embed, uid in enumerate(sorted(largest_bipartite_users))
        }
        new_embed_user = {v: k for k, v in new_user_embed.items()}
        new_hashtag_embed = {
            hid: embed
            for embed, hid in enumerate(sorted(largest_bipartite_hashtags))
        }
        new_embed_hashtag = {v: k for k, v in new_hashtag_embed.items()}

        bipartite_edges = {}
        for uid in largest_bipartite_users:
            bipartite_edges[new_user_embed[uid]] = []
            for hid, _ in uid_hid_stats[uid][1:]:
                bipartite_edges[new_user_embed[uid]].append(
                    new_hashtag_embed[hid])
        row, col = [], []
        for key, item in bipartite_edges.items():
            row += [key] * len(item)
            col += item
        biadjacency = sparse.csr_matrix((np.ones(len(row),
                                                 dtype=int), (row, col)))

        print('built the biadjacency')

        bispectral = BiSpectralClustering(n_clusters=n_cluster)
        print('running BiSpectralClustering...')
        bispectral.fit(biadjacency)
        print('completed BiSpectralClustering...')
        row_labels = bispectral.row_labels_
        col_labels = bispectral.col_labels_
        clusters = [[] for _ in range(n_cluster)]
        for user_idx, label in enumerate(row_labels):
            clusters[label].append(new_embed_user[user_idx])
        for hashtag_idx, label in enumerate(col_labels):
            clusters[label].append(new_embed_hashtag[hashtag_idx])
        for i in range(n_cluster):
            print('cluster {0}, size: {1}, num_user: {2}, num_hashtag: {3}'.
                  format(i, len(clusters[i]),
                         len([x for x in clusters[i] if x.startswith('u')]),
                         len([x for x in clusters[i] if x.startswith('h')])))
            with open('./{0}_cluster{1}.txt'.format(date_type, i),
                      'w') as fout:
                fout.write(','.join(clusters[i]))
Example #19
def main():
    # == == == == == == Part 1: Set up environment == == == == == == #
    timer = Timer()
    timer.start()

    data_prefix = '../data/'

    # == == == == == == Part 2: Load video views == == == == == == #
    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_avg_view_dict = data_loader.embed_avg_view_dict
    num_videos = data_loader.num_videos

    # == == == == == == Part 3: Build views percentile partition == == == == == == #
    day_views = list(embed_avg_view_dict.values())
    median_value = np.median(day_views)
    # the top 1st quantile is 75th percentile and above
    first_quantile_value = np.percentile(day_views, 75)
    third_quantile_value = np.percentile(day_views, 25)

    embed_percentile_dict = {}
    for embed in np.arange(num_videos):
        if embed_avg_view_dict[embed] >= first_quantile_value:
            embed_percentile_dict[embed] = 0
        elif embed_avg_view_dict[embed] >= median_value:
            embed_percentile_dict[embed] = 1
        elif embed_avg_view_dict[embed] >= third_quantile_value:
            embed_percentile_dict[embed] = 2
        else:
            embed_percentile_dict[embed] = 3

    # == == == == == == Part 4: Load dynamic network snapshot == == == == == == #
    edge_weight_mat = np.zeros((4, 4), dtype=np.float32)
    for t in range(T):
        filename = 'network_{0}.p'.format(
            (datetime(2018, 9, 1) + timedelta(days=t)).strftime('%Y-%m-%d'))
        with open(os.path.join(data_prefix, 'network_pickle', filename),
                  'rb') as fin:
            network_dict = pickle.load(fin)
            # embed_tar: [(embed_src, pos_src, view_src), ...]
            for embed_tar in range(num_videos):
                for embed_src, pos_src, _ in network_dict[embed_tar]:
                    if pos_src < NUM_REL:
                        edge_weight_mat[(
                            embed_percentile_dict[embed_src],
                            embed_percentile_dict[embed_tar])] += 1 / T
        print('>>> Finish loading day {0}...'.format(t + 1))
    edge_weight_mat = edge_weight_mat.astype(int)  # np.int is removed in recent NumPy
    print('>>> Network structure has been loaded!')

    # == == == == == == Part 5: Plot graph by network2tikz == == == == == == #
    # Network
    # -------
    # every possible pair, including self loop
    network_structure = []
    num_partitions = 4
    for pair in itertools.product(np.arange(num_partitions), repeat=2):
        network_structure.append(pair)
    net = igraph.Graph(network_structure, directed=True)

    # Network attributes
    # ------------------
    # Network dicts
    # -------------
    layout = {0: (0, 0), 1: (1, 0), 2: (2, 0), 3: (3, 0)}

    # Visual style dict
    # -----------------
    visual_style = {}

    # node styles
    # -----------
    visual_style['vertex_size'] = 0.9
    visual_style['vertex_color'] = ColorPalette.CCRGB4
    visual_style['vertex_opacity'] = 0.6
    visual_style['vertex_label'] = [
        'top 25\%', '(25\%, 50\%]', '(50\%, 75\%]', 'bottom 25\%'
    ]
    visual_style['vertex_label_distance'] = 0
    visual_style['vertex_label_size'] = [5, 4, 4, 4]

    # edge styles
    # -----------
    edge_width = list(np.ravel(edge_weight_mat))
    visual_style['edge_width'] = scaler(edge_width)
    visual_style['edge_curved'] = 0.7
    edge_label = ['{{{:,}}}'.format(x) for x in edge_width]
    visual_style['edge_label'] = edge_label
    visual_style['edge_label_size'] = 4.5
    visual_style['edge_loop_shape'] = 60
    visual_style['edge_loop_size'] = 1
    visual_style['edge_loop_position'] = [180, 0, 0, 0]
    visual_style['edge_arrow_size'] = 0.01
    visual_style['edge_arrow_width'] = [
        0.03, 0.01, 0.01, 0.01, 0.02, 0.01, 0.01, 0.01, 0.02, 0.01, 0.01, 0.01,
        0.02, 0.01, 0.01, 0.01
    ]

    # general options
    # ---------------
    visual_style['layout'] = layout
    visual_style['canvas'] = (10, 3.5)
    visual_style['margin'] = 1.5

    # Create pdf figure of the network
    plot(net, '../images/measure_how_videos_connect.pdf', **visual_style)
    print('>>> Generated pdf file ../images/measure_how_videos_connect.pdf')

    timer.stop()
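
# 'scaler' (used for visual_style['edge_width'] above) is not defined in this
# snippet. A minimal sketch of a plausible implementation, assuming it min-max
# rescales the raw edge weights into a small line-width range for the tikz
# output (hypothetical, not the authors' exact helper):
import numpy as np

def scaler(values, lo=0.1, hi=1.0):
    """Min-max rescale a list of edge weights into the interval [lo, hi]."""
    arr = np.asarray(values, dtype=float)
    if arr.max() == arr.min():
        return [hi] * len(values)
    return list(lo + (hi - lo) * (arr - arr.min()) / (arr.max() - arr.min()))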
def main():
    timer = Timer()
    timer.start()

    cornflower_blue = ColorPalette.BLUE
    tomato = ColorPalette.TOMATO
    color_cycle_4 = ColorPalette.CC4
    label_fs = ColorPalette.LABELFS
    title_fs = ColorPalette.TITLEFS
    tick_style = ColorPalette.TICKSTYLE
    bar_text_style = ColorPalette.BARTEXTSTYLE

    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_view_dict = data_loader.embed_view_dict
    embed_avg_train_view_dict = {
        embed: np.mean(embed_view_dict[embed][:-NUM_OUTPUT])
        for embed in embed_view_dict.keys()
    }

    net_ratio_list = []

    src_to_tar_view_ratio = []
    link_weights_record = []

    naive_smape_list, snaive_smape_list, ar_smape_list, rnn_smape_list, arnet_smape_list = [
        [] for _ in range(5)
    ]
    naive_daily_smape_mat, snaive_daily_smape_mat, ar_daily_smape_mat, rnn_daily_smape_mat, arnet_daily_smape_mat = [
        np.empty((0, NUM_OUTPUT), float) for _ in range(5)  # np.float is removed in recent NumPy
    ]

    with open('./forecast_tracker_all.json', 'r') as fin:
        for line in fin:
            result_json = json.loads(line.rstrip())
            tar_embed = result_json['embed']

            true_value = result_json['true_value']
            naive_pred = result_json['naive_pred']
            snaive_pred = result_json['snaive_pred']
            ar_pred = result_json['ar_pred']
            rnn_pred = result_json['rnn_pred']
            arnet_pred = result_json['arnet_pred']

            naive_smape, naive_daily_smape_arr = smape(true_value, naive_pred)
            naive_smape_list.append(naive_smape)
            naive_daily_smape_mat = np.vstack(
                (naive_daily_smape_mat, naive_daily_smape_arr))

            snaive_smape, snaive_daily_smape_arr = smape(
                true_value, snaive_pred)
            snaive_smape_list.append(snaive_smape)
            snaive_daily_smape_mat = np.vstack(
                (snaive_daily_smape_mat, snaive_daily_smape_arr))

            ar_smape, ar_daily_smape_arr = smape(true_value, ar_pred)
            ar_smape_list.append(ar_smape)
            ar_daily_smape_mat = np.vstack(
                (ar_daily_smape_mat, ar_daily_smape_arr))

            rnn_smape, rnn_daily_smape_arr = smape(true_value, rnn_pred)
            rnn_smape_list.append(rnn_smape)
            rnn_daily_smape_mat = np.vstack(
                (rnn_daily_smape_mat, rnn_daily_smape_arr))

            arnet_smape, arnet_daily_smape_arr = smape(true_value, arnet_pred)
            arnet_smape_list.append(arnet_smape)
            arnet_daily_smape_mat = np.vstack(
                (arnet_daily_smape_mat, arnet_daily_smape_arr))

            # analyse network contribution
            arnet_net_ratio = result_json['net_ratio']
            net_ratio_list.append(arnet_net_ratio)

            incoming_embeds = result_json['incoming_embeds']
            link_weights = result_json['link_weights']
            for edge_inx, src_embed in enumerate(incoming_embeds):
                view_ratio = np.log10(embed_avg_train_view_dict[src_embed] /
                                      embed_avg_train_view_dict[tar_embed])
                src_to_tar_view_ratio.append(view_ratio)
                link_weights_record.append(link_weights[edge_inx])

    fig, axes = plt.subplots(ncols=3, nrows=1, figsize=(12, 4))
    axes = axes.ravel()

    # == == == == == == Part 1: Plot performance comparison == == == == == == #
    smape_mat = [
        naive_smape_list, snaive_smape_list, ar_smape_list, rnn_smape_list,
        arnet_smape_list
    ]
    axes[0].boxplot(smape_mat,
                    showfliers=False,
                    meanline=True,
                    showmeans=True,
                    widths=0.7)
    means = [np.mean(x) for x in smape_mat]
    pos = range(len(means))
    # annotate each box with its mean value
    for tick in pos:
        axes[0].text(pos[tick] + 1, means[tick] + 0.3,
                     '{0:.3f}'.format(means[tick]), **bar_text_style)

    axes[0].set_xticklabels(['Naive', 'SN', 'AR', 'RNN', 'ARNet'],
                            fontsize=label_fs)
    axes[0].set_ylabel('SMAPE', fontsize=label_fs)
    axes[0].tick_params(**tick_style)
    axes[0].set_title('(a)', fontsize=title_fs)

    # == == == == == == Part 2: Plot performance with forecast horizon extends == == == == == == #
    axes[1].plot(np.arange(1, 1 + NUM_OUTPUT),
                 np.mean(naive_daily_smape_mat, axis=0),
                 label='Naive',
                 c='k',
                 mfc='none',
                 marker='D',
                 markersize=4)
    axes[1].plot(np.arange(1, 1 + NUM_OUTPUT),
                 np.mean(snaive_daily_smape_mat, axis=0),
                 label='SN',
                 c=color_cycle_4[0],
                 mfc='none',
                 marker='*',
                 markersize=5)
    axes[1].plot(np.arange(1, 1 + NUM_OUTPUT),
                 np.mean(ar_daily_smape_mat, axis=0),
                 label='AR',
                 c=color_cycle_4[1],
                 mfc='none',
                 marker='s',
                 markersize=5)
    axes[1].plot(np.arange(1, 1 + NUM_OUTPUT),
                 np.mean(rnn_daily_smape_mat, axis=0),
                 label='RNN',
                 c=color_cycle_4[2],
                 mfc='none',
                 marker='^',
                 markersize=5)
    axes[1].plot(np.arange(1, 1 + NUM_OUTPUT),
                 np.mean(arnet_daily_smape_mat, axis=0),
                 label='ARNet',
                 c=color_cycle_4[3],
                 marker='o',
                 markersize=5)

    axes[1].set_xlabel('forecast horizon', fontsize=label_fs)
    axes[1].set_ylabel('SMAPE', fontsize=label_fs)
    axes[1].set_ylim([6, 23])
    axes[1].tick_params(**tick_style)
    axes[1].legend(frameon=False)
    axes[1].set_title('(b)', fontsize=title_fs)

    # == == == == == == Part 3: Plot link strength vs. view ratio from src to tar == == == == == == #
    bin_axis = np.arange(-2, 1.9, 0.1)
    bin_records = [[] for _ in range(len(bin_axis))]
    for x, y in zip(src_to_tar_view_ratio, link_weights_record):
        if x >= -2:
            bin_records[int(np.floor((x + 2) * 10))].append(y)

    for t in np.arange(5, 50, 5):
        axes[2].fill_between(bin_axis,
                             [np.percentile(x, 50 - t) for x in bin_records],
                             [np.percentile(x, 55 - t) for x in bin_records],
                             facecolor=cornflower_blue,
                             alpha=(100 - 2 * t) / 100,
                             lw=0)
        axes[2].fill_between(bin_axis,
                             [np.percentile(x, 45 + t) for x in bin_records],
                             [np.percentile(x, 50 + t) for x in bin_records],
                             facecolor=cornflower_blue,
                             alpha=(100 - 2 * t) / 100,
                             lw=0)

    for t in [10, 30, 70, 90]:
        axes[2].plot(bin_axis, [np.percentile(x, t) for x in bin_records],
                     color=cornflower_blue,
                     alpha=(100 - 2 * t) / 100,
                     lw=1,
                     zorder=15)
    median_line = [np.percentile(x, 50) for x in bin_records]
    axes[2].plot(bin_axis,
                 median_line,
                 color='k',
                 alpha=0.5,
                 zorder=20,
                 lw=1.5)
    axes[2].xaxis.set_major_formatter(
        FuncFormatter(lambda x, _: r'$10^{{{0:.0f}}}$'.format(x)))

    peak1_idx = int(np.argmax(median_line))
    peak2_idx = 10 + int(np.argmax(median_line[10:]))
    peak1 = (bin_axis[peak1_idx], median_line[peak1_idx])
    peak2 = (bin_axis[peak2_idx], median_line[peak2_idx])
    axes[2].scatter(peak1[0],
                    peak1[1],
                    s=15,
                    c=tomato,
                    edgecolors='k',
                    zorder=30)
    axes[2].text(peak1[0] + 0.08,
                 peak1[1] + 0.01,
                 '({0:.2f}, {1:.2f})'.format(10**peak1[0], peak1[1]),
                 ha='left',
                 va='center')
    axes[2].scatter(peak2[0],
                    peak2[1],
                    s=15,
                    c=tomato,
                    edgecolors='k',
                    zorder=30)
    axes[2].text(peak2[0],
                 peak2[1] + 0.02,
                 '({0:.2f}, {1:.2f})'.format(10**peak2[0], peak2[1]),
                 ha='center',
                 va='bottom')

    axes[2].set_xlim((-2.05, 2.02))
    axes[2].set_ylim((-0.02, 1.01))
    axes[2].set_xlabel('views ratio from video ' + r'$u$' + ' to video ' +
                       r'$v$',
                       fontsize=label_fs)
    axes[2].set_ylabel('estimated link strength ' + r'$\beta_{u, v}$',
                       fontsize=label_fs)
    axes[2].set_title('(c)', fontsize=title_fs)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/model_prediction_results.pdf', bbox_inches='tight')
    plt.show()
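
# 'smape' is not shown in this snippet. Its call sites expect an overall score
# plus a per-day array; a minimal sketch consistent with that interface
# (assumed, not necessarily the authors' exact definition):
import numpy as np

def smape(true_value, pred_value):
    """Symmetric MAPE in percent: returns (overall mean, per-step array)."""
    y = np.asarray(true_value, dtype=float)
    yhat = np.asarray(pred_value, dtype=float)
    denom = np.abs(y) + np.abs(yhat)
    # guard against 0/0 when both the truth and the prediction are zero
    daily = np.where(denom == 0, 0.0,
                     200.0 * np.abs(y - yhat) / np.where(denom == 0, 1.0, denom))
    return float(np.mean(daily)), daily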
Example #21
def main():
    timer = Timer()
    timer.start()

    cornflower_blue = ColorPalette.BLUE
    tomato = ColorPalette.TOMATO
    color_cycle_4 = ColorPalette.CC4
    label_fs = ColorPalette.LABELFS
    title_fs = ColorPalette.TITLEFS
    tick_style = ColorPalette.TICKSTYLE

    data_loader = DataLoader()
    data_loader.load_video_views()
    embed_view_dict = data_loader.embed_view_dict
    embed_avg_train_view_dict = {
        embed: np.mean(embed_view_dict[embed][:-NUM_OUTPUT])
        for embed in embed_view_dict.keys()
    }
    data_loader.load_embed_content_dict()
    embed_cid_dict = data_loader.embed_cid_dict
    embed_genre_dict = data_loader.embed_genre_dict

    cid_artist_dict = {}
    cid_tag_dict = {}
    with open('../data/artist_details.json', 'r') as fin:
        for line in fin:
            artist_json = json.loads(line.rstrip())
            cid_artist_dict[
                artist_json['channel_id']] = artist_json['artist_name']
            cid_tag_dict[artist_json['channel_id']] = artist_json['tag-dict']

    cid_views_dict = defaultdict(int)
    cid_views_wo_network_dict = defaultdict(int)

    arnet_smape_list = []
    net_ratio_list = []
    same_artist_net_ratio_list = []
    same_genre_net_ratio_list = []
    total_views = 0
    network_explained_views = 0

    with open('./embed_prediction.json', 'r') as fin:
        for line in fin:
            result_json = json.loads(line.rstrip())
            tar_embed = result_json['embed']
            avg_train_views = embed_avg_train_view_dict[tar_embed]

            true_value = result_json['true_value']
            arnet_pred = result_json['arnet_pred']
            arnet_smape_list.append(smape(true_value, arnet_pred)[0])

            incoming_embeds = result_json['incoming_embeds']
            link_weights = result_json['link_weights']
            same_artist_contributed_views = 0
            same_genre_contributed_views = 0
            for edge_inx, src_embed in enumerate(incoming_embeds):
                if embed_cid_dict[tar_embed] == embed_cid_dict[src_embed]:
                    same_artist_contributed_views += link_weights[
                        edge_inx] * embed_avg_train_view_dict[src_embed]
                if is_same_genre(embed_genre_dict[tar_embed],
                                 embed_genre_dict[src_embed]):
                    same_genre_contributed_views += link_weights[
                        edge_inx] * embed_avg_train_view_dict[src_embed]

            # analyse network contribution
            arnet_net_ratio = result_json['net_ratio']
            net_ratio_list.append(arnet_net_ratio)
            # rounding issue can make the value slightly larger than 1
            same_artist_net_ratio_list.append(
                min(same_artist_contributed_views / avg_train_views, 1))
            same_genre_net_ratio_list.append(
                min(same_genre_contributed_views / avg_train_views, 1))

            cid_views_dict[embed_cid_dict[tar_embed]] += avg_train_views
            cid_views_wo_network_dict[embed_cid_dict[
                tar_embed]] += avg_train_views * (1 - arnet_net_ratio)

            total_views += avg_train_views
            network_explained_views += avg_train_views * arnet_net_ratio

    print(
        '\nFor an average video in our dataset, we estimate {0:.1f}% of the views come from the network.'
        .format(100 * np.mean(net_ratio_list)))
    print(
        'In particular, {0:.1f}% ({1:.1f}%) of the views come from the same artist.'
        .format(
            100 * np.mean(same_artist_net_ratio_list), 100 *
            np.mean(same_artist_net_ratio_list) / np.mean(net_ratio_list)))
    print(
        'In total, our model estimates that the recommendation network contributes {0:.1f}% of popularity in the Vevo network.'
        .format(100 * network_explained_views / total_views))
    print('total views for 13K: {0:.1f}M'.format(total_views / 1000000))
    print('explained views for 13K: {0:.1f}M'.format(network_explained_views /
                                                     1000000))
    print('total views for 60K: {0:.1f}M'.format(
        np.sum(list(embed_avg_train_view_dict.values())) / 1000000))
    print('Gini coef with network: {0:.4f}'.format(
        gini(list(cid_views_dict.values()))))
    print('Gini coef without network: {0:.4f}\n'.format(
        gini(list(cid_views_wo_network_dict.values()))))

    fig, axes = plt.subplots(ncols=3, nrows=2, figsize=(12, 4.2))
    gs = axes[0, 0].get_gridspec()
    for ax in axes[:, 1]:
        ax.remove()
    ax_mid = fig.add_subplot(gs[:, 1])
    for ax in axes[:, 2]:
        ax.remove()
    ax_right = fig.add_subplot(gs[:, 2])
    axes = [axes[0, 0], axes[1, 0], ax_mid, ax_right]

    # == == == == == == Part 1: Plot SMAPE vs. traffic composition == == == == == == #
    num_bin = 10
    sorted_same_artist_tuple_list = sorted(
        [(x, y) for x, y in zip(same_artist_net_ratio_list, arnet_smape_list)],
        key=lambda x: x[0])
    same_artist_split_values = [
        np.percentile(same_artist_net_ratio_list, x)
        for x in np.arange(10, 101, 10)
    ]
    same_artist_bins = [[] for _ in range(num_bin)]
    for same_artist_net_ratio, arnet_smape in sorted_same_artist_tuple_list:
        slice_idx = int(
            np.floor(
                percentileofscore(same_artist_net_ratio_list,
                                  same_artist_net_ratio) / 10))
        if slice_idx >= num_bin:
            slice_idx = num_bin - 1
        same_artist_bins[slice_idx].append(arnet_smape)

    sorted_same_genre_tuple_list = sorted(
        [(x, y) for x, y in zip(same_genre_net_ratio_list, arnet_smape_list)],
        key=lambda x: x[0])
    same_genre_split_values = [
        np.percentile(same_genre_net_ratio_list, x)
        for x in np.arange(10, 101, 10)
    ]
    same_genre_bins = [[] for _ in range(num_bin)]
    for same_genre_net_ratio, arnet_smape in sorted_same_genre_tuple_list:
        slice_idx = int(
            np.floor(
                percentileofscore(same_genre_net_ratio_list,
                                  same_genre_net_ratio) / 10))
        if slice_idx >= num_bin:
            slice_idx = num_bin - 1
        same_genre_bins[slice_idx].append(arnet_smape)

    axes[0].plot(range(1, 11, 1), [np.mean(x) for x in same_artist_bins],
                 color=cornflower_blue,
                 label='same artist',
                 mfc='none',
                 marker='o',
                 markersize=4)
    axes[1].plot(range(1, 11, 1), [np.mean(x) for x in same_genre_bins],
                 color=tomato,
                 label='same genre',
                 mfc='none',
                 marker='o',
                 markersize=4)

    for ax in [axes[0], axes[1]]:
        ax.set_xlim([0.5, 10.5])
        ax.set_ylim([7, 10.5])
        ax.set_ylabel('SMAPE', fontsize=label_fs)
        ax.xaxis.set_ticks(np.arange(1, 10, 2))
        ax.tick_params(**tick_style)
        ax.legend(frameon=False)

    axes[0].xaxis.set_major_formatter(
        FuncFormatter(lambda x, _: '({0:.3f})'.format(same_artist_split_values[
            int(x) - 1])))
    axes[1].xaxis.set_major_formatter(
        FuncFormatter(lambda x, _: '{0:.0f}%\n({1:.3f})'.format(
            10 * x, same_genre_split_values[int(x) - 1])))

    # axes[0].xaxis.set_major_formatter(
    #     FuncFormatter(lambda x, _: '({0:.3f})'.format(10 * x)))
    # axes[1].xaxis.set_major_formatter(
    #     FuncFormatter(lambda x, _: '{0:.0f}%\n({1:.3f})'.format(10 * x, 10 * x)))

    axes[1].set_xlabel(r'$\eta_v$ percentile', fontsize=label_fs)
    axes[0].set_title('(a)', fontsize=title_fs)

    # == == == == == == Part 2: Plot who can utilize the network better? == == == == == == #
    artist_views_list = list(cid_views_dict.values())
    wo_network_artist_views_list = list(cid_views_wo_network_dict.values())
    cid_list = sorted(cid_views_dict.keys())
    artist_true_percentile = [
        percentileofscore(artist_views_list, cid_views_dict[cid])
        for cid in cid_list
    ]
    wo_network_artist_percentile = [
        percentileofscore(wo_network_artist_views_list,
                          cid_views_wo_network_dict[cid]) for cid in cid_list
    ]
    percentile_change = np.array([
        artist_true_percentile[i] - wo_network_artist_percentile[i]
        for i in range(len(cid_list))
    ])

    num_popularity_loss = sum(percentile_change < 0)
    num_popularity_equal = sum(percentile_change == 0)
    num_popularity_gain = sum(percentile_change > 0)
    print('{0} ({1:.2f}%) artists lose popularity with network'.format(
        num_popularity_loss, num_popularity_loss / len(cid_list) * 100))
    print('{0} ({1:.2f}%) artists with no popularity change'.format(
        num_popularity_equal, num_popularity_equal / len(cid_list) * 100))
    print('{0} ({1:.2f}%) artists gain popularity with network\n'.format(
        num_popularity_gain, num_popularity_gain / len(cid_list) * 100))

    artist_percentile_mat = [[] for _ in range(10)]
    artist_cid_mat = [[] for _ in range(10)]
    for idx, percentile_value in enumerate(wo_network_artist_percentile):
        bin_idx = min(int(np.floor(percentile_value / 10)), 9)
        artist_percentile_mat[bin_idx].append(artist_true_percentile[idx] -
                                              percentile_value)
        artist_cid_mat[bin_idx].append(cid_list[idx])

    red_circle = dict(markerfacecolor=tomato, marker='o', markersize=4)
    axes[2].boxplot(artist_percentile_mat,
                    showfliers=True,
                    widths=0.5,
                    flierprops=red_circle)
    axes[2].axhline(y=0, color=cornflower_blue, linestyle='--', lw=1, zorder=0)
    axes[2].set_xlabel('artist popularity percentile without network',
                       fontsize=label_fs)
    axes[2].set_ylabel('percentile change with network', fontsize=label_fs)
    axes[2].tick_params(**tick_style)
    axes[2].set_xticks(axes[2].get_xticks()[::2])
    axes[2].xaxis.set_major_formatter(
        FuncFormatter(lambda x, _: '{0:.0f}%'.format(10 * x)))
    axes[2].yaxis.set_major_formatter(
        FuncFormatter(lambda x, _: '{0:.0f}%'.format(x)))
    axes[2].set_title('(b)', fontsize=title_fs)

    # find outliers
    whis = 1.5
    top_outliers_list = []
    bottom_outliers_list = []
    for box_idx, box in enumerate(artist_percentile_mat):
        q1 = np.percentile(box, 25)
        q3 = np.percentile(box, 75)
        iq = q3 - q1
        hi_val = q3 + whis * iq
        lo_val = q1 - whis * iq
        for idx, val in enumerate(box):
            if val > hi_val:
                top_outliers_list.append((artist_cid_mat[box_idx][idx], val))
            elif val < lo_val:
                bottom_outliers_list.append(
                    (artist_cid_mat[box_idx][idx], val))

    sorted_top_outliers_list = sorted(
        [(cid_artist_dict[x[0]], cid_tag_dict[x[0]], int(
            cid_views_dict[x[0]]), x[1]) for x in top_outliers_list],
        key=lambda t: t[2],
        reverse=True)
    for t in sorted_top_outliers_list:
        print(t)
    print('-------------------')
    sorted_bottom_outliers_list = sorted(
        [(cid_artist_dict[x[0]], cid_tag_dict[x[0]], int(
            cid_views_dict[x[0]]), x[1]) for x in bottom_outliers_list],
        key=lambda t: t[2],
        reverse=True)
    for t in sorted_bottom_outliers_list:
        print(t)

    indie_xaxis, indie_yaxis = [], []
    rap_xaxis, rap_yaxis = [], []
    other_xaxis, other_yaxis = [], []
    lose_xaxis, lose_yaxis = [], []
    for top_outlier, _ in top_outliers_list:
        if 'indie' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'alternative' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'new wave' in ','.join(cid_tag_dict[top_outlier].keys()):
            indie_xaxis.append(cid_views_dict[top_outlier])
            indie_yaxis.append((cid_views_dict[top_outlier] -
                                cid_views_wo_network_dict[top_outlier]) /
                               cid_views_dict[top_outlier])
        elif 'rap' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'hip hop' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'rhythm and blues' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'reggae' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'punk' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'funk' in ','.join(cid_tag_dict[top_outlier].keys()) or \
                'r&b' in ','.join(cid_tag_dict[top_outlier].keys()):
            rap_xaxis.append(cid_views_dict[top_outlier])
            rap_yaxis.append((cid_views_dict[top_outlier] -
                              cid_views_wo_network_dict[top_outlier]) /
                             cid_views_dict[top_outlier])
        else:
            other_xaxis.append(cid_views_dict[top_outlier])
            other_yaxis.append((cid_views_dict[top_outlier] -
                                cid_views_wo_network_dict[top_outlier]) /
                               cid_views_dict[top_outlier])
    for bottom_outlier, _ in bottom_outliers_list:
        lose_xaxis.append(cid_views_dict[bottom_outlier])
        lose_yaxis.append((cid_views_dict[bottom_outlier] -
                           cid_views_wo_network_dict[bottom_outlier]) /
                          cid_views_dict[bottom_outlier])

    axes[3].scatter(indie_xaxis,
                    indie_yaxis,
                    marker='^',
                    facecolors='none',
                    edgecolors=color_cycle_4[0],
                    s=20,
                    label='Indie: {0}'.format(len(indie_xaxis)))
    axes[3].scatter(rap_xaxis,
                    rap_yaxis,
                    marker='o',
                    facecolors='none',
                    edgecolors=color_cycle_4[1],
                    s=20,
                    label='Hip hop: {0}'.format(len(rap_xaxis)))
    axes[3].scatter(other_xaxis,
                    other_yaxis,
                    marker='s',
                    facecolors='none',
                    edgecolors=color_cycle_4[2],
                    s=20,
                    label='Other: {0}'.format(len(other_xaxis)))
    # axes[3].scatter(lose_xaxis, lose_yaxis, marker='x', color=color_cycle_4[3], s=20, label='artists lose popularity: {0}'.format(len(bad_xaxis)))
    axes[3].set_ylim((-0.02, 1.02))
    axes[3].set_xscale('log')
    axes[3].set_xlabel('artist average daily views', fontsize=label_fs)
    axes[3].set_ylabel('network contribution ratio ' + r'$\eta_v$',
                       fontsize=label_fs)
    axes[3].tick_params(**tick_style)
    axes[3].legend(frameon=False, loc='lower left')
    axes[3].set_title('(c)', fontsize=title_fs)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(w_pad=0.2)
    plt.savefig('../images/model_prediction_analysis.pdf', bbox_inches='tight')
    plt.show()
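
# 'gini' (used for the concentration of artist views above) is not defined in
# this snippet. A standard Gini coefficient implementation consistent with how
# it is called (a list of non-negative view counts) might look like this
# (assumed):
import numpy as np

def gini(values):
    """Gini coefficient of a list of non-negative values (0 = perfect equality)."""
    arr = np.sort(np.asarray(values, dtype=float))
    n = len(arr)
    if n == 0 or arr.sum() == 0:
        return 0.0
    index = np.arange(1, n + 1)
    return float(2 * np.sum(index * arr) / (n * arr.sum()) - (n + 1) / n)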
Example #22
# PROJECT_DIR = os.getcwd()  # Must execute from project dir
PKG_DIR = os.path.dirname(__file__)
PROJECT_DIR = os.path.dirname(PKG_DIR)
release_dir = os.path.join(PROJECT_DIR, 'release', option_output_dir)
os.chdir(PROJECT_DIR)

# Add Paths
PATHS.append(os.path.join(PROJECT_DIR, 'bin'))
for p in PATHS:
    sys.path.append(p)

# Additional Paths from Options
if option_directory:
    sys.path.append(option_directory)

logger.debug(sys.path)
logger.debug(arguments)
logger.debug(ASSEMBLIES)
if arguments['make']:
    timer = Timer()
    if not option_all:
        ASSEMBLIES = [option_assembly_name]

    for assembly_name in ASSEMBLIES:
        assembly_dict = make(release_dir,
                             assembly_name,
                             overwrite=option_overwrite,
                             quiet=option_all)
        if option_json:
            dump_json_log(assembly_dict)
    logger.info('Done: {} seconds'.format(timer.stop()))
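
# The Timer class (imported from utils.helper elsewhere) is used throughout
# these examples but never shown. A minimal sketch consistent with its usage,
# where construction starts the clock, start() restarts it, and stop() reports
# and returns the elapsed seconds as the logging line above implies (assumed
# interface):
import time

class Timer:
    def __init__(self):
        self._start = time.time()

    def start(self):
        self._start = time.time()

    def stop(self):
        elapsed = time.time() - self._start
        print('elapsed time: {0:.2f} seconds'.format(elapsed))
        return elapsed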
def main():
    timer = Timer()
    timer.start()

    consumer_key = conf.twitter_consumer_key
    consumer_secret = conf.twitter_consumer_secret
    access_token = conf.twitter_access_token
    access_token_secret = conf.twitter_access_secret

    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = API(auth)

    app_name = 'mbfc'

    num_hit_twitter = 0
    num_fail_twitter = 0
    num_request = 0

    # search the Twitter Search API for Twitter handles that we could not find on the webpage
    with open('data/{0}/{0}_ratings_v4.csv'.format(app_name), 'w') as fout:
        with open('data/{0}/{0}_ratings_v3.csv'.format(app_name), 'r') as fin:
            fout.write(fin.readline())
            for line in fin:
                title, tail = line.rstrip().split(',', 1)
                middle, website_url, tw_handle, tw_sim, yt_id, yt_user = tail.rsplit(
                    ',', 5)
                if tw_handle == '':
                    print('===============')
                    print('media title', title)
                    print('website url', website_url)
                    # get the first 10 Twitter users
                    returned_users = api.search_users(title, count=10)
                    num_request += 1
                    to_write = True
                    for user in returned_users:
                        user_json = user._json
                        screen_name = user_json['screen_name'].lower()
                        if match_website_on_twitter_page(
                                user_json, website_url):
                            selected_tw_handle = screen_name
                            num_hit_twitter += 1
                            tw_similarity = SequenceMatcher(
                                None,
                                tldextract.extract(
                                    get_domain(website_url)).domain,
                                selected_tw_handle).ratio()
                            fout.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
                                title, middle, website_url, selected_tw_handle,
                                tw_similarity, yt_id, yt_user))
                            to_write = False
                            print('find twitter handle:', selected_tw_handle)
                            print('success index {0}/{2}: {1}'.format(
                                num_hit_twitter, title, num_request))
                            break
                    if to_write:
                        num_fail_twitter += 1
                        fout.write(line)
                        print('xxx failed to find for this media {0}, {1}!'.
                              format(title, website_url))
                        print('fail index {0}/{2}: {1}'.format(
                            num_fail_twitter, title, num_request))
                else:
                    fout.write(line)

    print('number of requests sent: {0}'.format(num_request))
    timer.stop()
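
# 'get_domain' and 'match_website_on_twitter_page' are not shown in this
# snippet. A plausible minimal sketch (hypothetical): extract the hostname of
# a URL, and accept a Twitter user when the profile's expanded URL shares the
# media outlet's registered domain.
from urllib.parse import urlparse
import tldextract

def get_domain(url):
    """Return the hostname part of a URL, e.g. 'www.example.com'."""
    if not url.startswith('http'):
        url = 'http://' + url
    return urlparse(url).netloc

def match_website_on_twitter_page(user_json, website_url):
    """True if the profile URL points to the same registered domain as the outlet."""
    target = tldextract.extract(get_domain(website_url)).registered_domain
    try:
        profile_url = user_json['entities']['url']['urls'][0]['expanded_url']
    except (KeyError, IndexError):
        return False
    return tldextract.extract(profile_url).registered_domain == target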
def main():
    timer = Timer()
    timer.start()

    app_name = 'mbfc'

    num_hit_youtube = 0
    num_fail_youtube = 0
    num_search = 0

    with open('data/{0}/{0}_ratings_v5.csv'.format(app_name), 'w') as fout:
        with open('data/{0}/{0}_ratings_v4.csv'.format(app_name), 'r') as fin:
            fout.write(fin.readline())
            for line in fin:
                title, tail = line.rstrip().split(',', 1)
                middle, website_url, tw_handle, tw_sim, yt_id, yt_user = tail.rsplit(
                    ',', 5)
                if yt_id != '' or yt_user != '':
                    fout.write(line)
                else:
                    # search the YouTube search bar for YouTube channels that we could not find on the webpage
                    print('===============')
                    num_search += 1
                    num_search_results = 0
                    search_results = []
                    for _ in range(5):
                        if num_search_results == 0:
                            print(
                                'sent a request to YouTube search bar with title "{0}"...'
                                .format(title))
                            search_request = get_search_request(title)
                            print(search_request)
                            try:
                                search_response = requests.get(
                                    search_request,
                                    headers={
                                        'User-Agent':
                                        random.choice(USER_AGENT_LIST)
                                    })
                            except Exception as e:
                                print(str(e))
                                search_response = requests.get(search_request)
                            time.sleep(1)

                            if search_response:
                                html = search_response.text

                                try:
                                    initial_data = json.loads(
                                        find_value(
                                            html, 'window["ytInitialData"] = ',
                                            0, '\n').rstrip(';'))
                                    # print(json.dumps(initial_data))
                                except Exception:
                                    # malformed or missing ytInitialData blob; retry
                                    continue

                                # get the first 10 YouTube channels
                                search_results = list(
                                    search_dict(initial_data,
                                                'channelRenderer'))[:10]
                                num_search_results = len(search_results)
                                print('find {0} search results'.format(
                                    num_search_results))

                    found_match = False
                    for search_result in search_results:
                        channel_title = search_result['title']['simpleText']
                        channel_id = search_result['navigationEndpoint'][
                            'browseEndpoint']['browseId']
                        print(channel_title, channel_id)
                        if channel_title != '':
                            print(
                                YOUTUBE_CHANNEL_ABOUT.format(
                                    channel_id=channel_id))
                            channel_response = requests.get(
                                YOUTUBE_CHANNEL_ABOUT.format(
                                    channel_id=channel_id))
                            time.sleep(1)
                            if match_links_on_youtube_page(
                                    channel_response, website_url, tw_handle):
                                found_match = True
                                yt_id = channel_id
                                break

                    if found_match:
                        num_hit_youtube += 1
                        print('success index {0}/{4}: {1}, {2}, {3}'.format(
                            num_hit_youtube, title, yt_id, '', num_search))
                        fout.write('{0},{1},{2},{3},{4},{5},{6}\n'.format(
                            title, middle, website_url, tw_handle, tw_sim,
                            yt_id, ''))
                    else:
                        num_fail_youtube += 1
                        print('fail index {0}/{2}: {1}'.format(
                            num_fail_youtube, title, num_search))
                        fout.write(line)

    timer.stop()
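
# 'find_value' and 'search_dict' are not shown in this snippet. Plausible
# minimal versions (assumed): find_value slices the raw HTML between a key
# string and a terminating separator, and search_dict walks a nested
# dict/list structure yielding every value stored under a given key.
def find_value(html, key, num_chars=0, separator='"'):
    """Return the substring that follows `key` (plus num_chars) up to `separator`."""
    start = html.find(key) + len(key) + num_chars
    end = html.find(separator, start)
    return html[start:end]

def search_dict(partial, search_key):
    """Yield all values stored under `search_key` anywhere in a nested structure."""
    stack = [partial]
    while stack:
        current = stack.pop()
        if isinstance(current, dict):
            for key, value in current.items():
                if key == search_key:
                    yield value
                else:
                    stack.append(value)
        elif isinstance(current, list):
            stack.extend(current)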
Example #25
Usage: python duration_predictor.py -i ./ -o ./output -f re
Time: ~1M
"""

import os, sys, pickle, argparse
import numpy as np
from sklearn.metrics import mean_absolute_error, r2_score

sys.path.append(os.path.join(os.path.dirname(__file__), '../'))
from utils.helper import Timer, write_dict_to_pickle
from utils.converter import to_watch_percentage


if __name__ == '__main__':
    # == == == == == == == == Part 1: Set up experiment parameters == == == == == == == == #
    timer = Timer()
    timer.start()

    test_vids = []
    test_duration = []
    true_engagement = []
    guess_engagement = []

    # == == == == == == == == Part 2: Load dataset == == == == == == == == #
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input', help='input file dir of formatted dataset', required=True)
    parser.add_argument('-o', '--output', help='output file dir of predictor result', required=True)
    parser.add_argument('-f', '--function', help='choose prediction target', required=True)
    args = parser.parse_args()

    input_dir = args.input
def main():
    timer = Timer()
    timer.start()

    cornflower_blue = ColorPalette.BLUE
    tomato = ColorPalette.TOMATO
    color_cycle_4 = ColorPalette.CC4
    label_fs = ColorPalette.LABELFS
    title_fs = ColorPalette.TITLEFS
    tick_style = ColorPalette.TICKSTYLE
    bar_text_style = ColorPalette.BARTEXTSTYLE

    data_loader = DataLoader()
    data_loader.load_embed_content_dict()
    embed_cid_dict = data_loader.embed_cid_dict
    embed_genre_dict = data_loader.embed_genre_dict

    fig, axes = plt.subplots(ncols=3, nrows=2, figsize=(12, 4))
    gs = axes[0, 0].get_gridspec()
    for ax in axes[:, 0]:
        ax.remove()
    ax_left = fig.add_subplot(gs[:, 0])
    for ax in axes[:, 1]:
        ax.remove()
    ax_mid = fig.add_subplot(gs[:, 1])
    axes = [ax_left, ax_mid, axes[0, 2], axes[1, 2]]

    # == == == == == == Part 1: Plot the probability of forming a persistent link == == == == == == #
    p_form_list = []
    p_persistent_list = []
    with open('./justify_persistent_link.log', 'r') as fin:
        for line in fin:
            _, p_form, _, p_persistent = re.split(',|:', line)
            p_form = float(p_form.strip())
            p_persistent = float(p_persistent.strip())
            p_form_list.append(p_form)
            p_persistent_list.append(p_persistent)

    axes[0].plot(p_form_list, p_persistent_list, color=cornflower_blue)
    for p_form in [0.5, 0.7, 0.8, 0.9]:
        p_persistent = p_persistent_list[int(p_form * 100)]
        axes[0].scatter(p_form,
                        p_persistent,
                        s=15,
                        c=tomato,
                        edgecolors='k',
                        zorder=30)
        axes[0].text(p_form - 0.01,
                     p_persistent,
                     '({0:.2f}, {1:.2f})'.format(p_form, p_persistent),
                     ha='right',
                     va='bottom')
    axes[0].set_xlabel('prob. of forming a link', fontsize=label_fs)
    axes[0].set_ylabel('prob. of being persistent link', fontsize=label_fs)
    axes[0].tick_params(**tick_style)
    axes[0].set_title('(a)', fontsize=title_fs)

    # == == == == == == Part 2: Plot the portion of persistent links that pass statistics test == == == == == == #
    log_files_list = [
        './random_pearsonr.log', './ephemeral_pearsonr.log',
        './persistent_pearsonr.log', './reciprocal_pearsonr.log'
    ]
    link_cnt_list = []
    sign_ratio_list = []
    same_artist_list = []
    sign_ratio_same_artist_list = []
    same_genre_list = []
    sign_ratio_same_genre_list = []
    for log_file in log_files_list:
        cnt = 0
        same_artist_cnt = 0
        same_genre_cnt = 0

        sign_cnt = 0
        sign_cnt_same_artist = 0
        sign_cnt_same_genre = 0

        with open(log_file, 'r') as fin:
            for line in fin:
                src_embed, tar_embed, r, p = line.rstrip().split(',')
                src_embed = int(src_embed)
                tar_embed = int(tar_embed)
                r = float(r)
                p = float(p)
                if p < 0.05:
                    sign_cnt += 1

                cnt += 1
                if embed_cid_dict[src_embed] == embed_cid_dict[tar_embed]:
                    same_artist_cnt += 1
                    if p < 0.05:
                        sign_cnt_same_artist += 1
                if is_same_genre(embed_genre_dict[src_embed],
                                 embed_genre_dict[tar_embed]):
                    same_genre_cnt += 1
                    if p < 0.05:
                        sign_cnt_same_genre += 1

        sign_ratio_list.append(sign_cnt / cnt)

        same_artist_list.append(same_artist_cnt / cnt)
        sign_ratio_same_artist_list.append(sign_cnt_same_artist / cnt)

        same_genre_list.append(same_genre_cnt / cnt)
        sign_ratio_same_genre_list.append(sign_cnt_same_genre / cnt)

        link_cnt_list.append(cnt)
        print(
            '#links: {0}, #sign links: {1}, #sign same artist: {2}, #sign same genre: {3}'
            .format(cnt, sign_cnt, sign_cnt_same_artist, sign_cnt_same_genre))

    ind = np.arange(len(log_files_list))
    axes[1].bar(ind,
                sign_ratio_list,
                0.6,
                edgecolor=['k'] * 4,
                color=color_cycle_4,
                lw=1.5,
                alpha=0.6)
    axes[1].set_ylim([0, axes[0].get_ylim()[1]])
    axes[1].set_ylabel('percentage of links with p<0.05', fontsize=label_fs)
    axes[1].set_xticklabels(
        ('', 'random' + r'$^{}$' + '\n({0:,})'.format(link_cnt_list[0]),
         'ephemeral' + r'$^{}$' + '\n({0:,})'.format(link_cnt_list[1]),
         'persistent' + r'$^{-}$' + '\n({0:,})'.format(link_cnt_list[2]),
         'reciprocal' + r'$^{}$' + '\n({0:,})'.format(link_cnt_list[3])))
    for tick in ind:
        axes[1].text(tick, sign_ratio_list[tick] + 0.01,
                     '{0:.3f}'.format(sign_ratio_list[tick]), **bar_text_style)
    axes[1].tick_params(**tick_style)
    axes[1].set_title('(b)', fontsize=title_fs)

    # == == == == == == Part 3: Plot the percentage of significant persistent links belong to the same artist or contain the same genre == == == == == == #
    axes[2].bar(ind,
                np.array(same_artist_list) -
                np.array(sign_ratio_same_artist_list),
                0.6,
                bottom=sign_ratio_same_artist_list,
                edgecolor=color_cycle_4,
                color=['w'] * 4,
                hatch='//',
                lw=1.5,
                alpha=0.6)
    axes[2].bar(ind,
                sign_ratio_same_artist_list,
                0.6,
                edgecolor=['k'] * 4,
                color=color_cycle_4,
                lw=1.5,
                alpha=0.6)
    axes[2].set_ylim([0, axes[0].get_ylim()[1]])
    axes[2].set_ylabel('same artist', fontsize=label_fs)
    axes[2].text(0, same_artist_list[0] + 0.01,
                 '{0:.3f}'.format(same_artist_list[0]), **bar_text_style)
    for tick in ind[1:]:
        axes[2].text(tick, same_artist_list[tick] + 0.01,
                     '{0:.3f}'.format(same_artist_list[tick]),
                     **bar_text_style)
        axes[2].text(tick, sign_ratio_same_artist_list[tick] + 0.01,
                     '{0:.3f}'.format(sign_ratio_same_artist_list[tick]),
                     **bar_text_style)
    axes[2].tick_params(**tick_style)
    axes[2].get_xaxis().set_visible(False)
    axes[2].set_title('(c)', fontsize=title_fs)

    axes[3].bar(ind,
                np.array(same_genre_list) -
                np.array(sign_ratio_same_genre_list),
                0.6,
                bottom=sign_ratio_same_genre_list,
                edgecolor=color_cycle_4,
                color=['w'] * 4,
                hatch='//',
                lw=1.5,
                alpha=0.6)
    axes[3].bar(ind,
                sign_ratio_same_genre_list,
                0.6,
                edgecolor=['k'] * 4,
                color=color_cycle_4,
                lw=1.5,
                alpha=0.6)
    axes[3].set_ylim([0, axes[0].get_ylim()[1]])
    axes[3].set_ylabel('same genre', fontsize=label_fs)
    for tick in ind:
        axes[3].text(tick, same_genre_list[tick] + 0.01,
                     '{0:.3f}'.format(same_genre_list[tick]), **bar_text_style)
        axes[3].text(tick, sign_ratio_same_genre_list[tick] + 0.01,
                     '{0:.3f}'.format(sign_ratio_same_genre_list[tick]),
                     **bar_text_style)
    axes[3].tick_params(**tick_style)
    axes[3].set_xticklabels(
        ('', 'random' + r'$^{}$' + '\n({0:,})'.format(link_cnt_list[0]),
         'ephemeral' + r'$^{}$' + '\n({0:,})'.format(link_cnt_list[1]),
         'persistent' + r'$^{-}$' + '\n({0:,})'.format(link_cnt_list[2]),
         'reciprocal' + r'$^{}$' + '\n({0:,})'.format(link_cnt_list[3])))

    hide_spines(axes)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/model_persistent_links.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
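
# 'is_same_genre' is used above but not defined in this snippet. A minimal
# sketch consistent with its call sites, where each video carries a collection
# of genre labels (assumed):
def is_same_genre(genres_a, genres_b):
    """True if the two videos share at least one genre label."""
    return len(set(genres_a) & set(genres_b)) > 0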
def main():
    timer = Timer()
    timer.start()

    cc4 = ColorPalette.CC4
    blue = cc4[0]

    app_name = 'cyberbullying'
    rho = 0.5272
    entity = 'user'

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))

    print('for entity: {0}'.format(entity))
    sample_entity_freq_dict = defaultdict(int)
    with open('../data/{1}_out/{0}_{1}_all.txt'.format(entity, app_name), 'r') as sample_datefile:
        for line in sample_datefile:
            sample_entity_freq_dict[line.rstrip().split(',')[1]] += 1

    complete_entity_freq_dict = defaultdict(int)
    with open('../data/{1}_out/complete_{0}_{1}.txt'.format(entity, app_name), 'r') as complete_datefile:
        for line in complete_datefile:
            complete_entity_freq_dict[line.rstrip().split(',')[1]] += 1

    complete_to_sample_freq_dict = defaultdict(list)
    sample_to_complete_freq_dict = defaultdict(list)

    for item, complete_vol in complete_entity_freq_dict.items():
        if item in sample_entity_freq_dict:
            complete_to_sample_freq_dict[complete_vol].append(sample_entity_freq_dict[item])
        else:
            complete_to_sample_freq_dict[complete_vol].append(0)

    for item, sample_vol in sample_entity_freq_dict.items():
        sample_to_complete_freq_dict[sample_vol].append(complete_entity_freq_dict[item])

    for item in set(complete_entity_freq_dict.keys()) - set(sample_entity_freq_dict.keys()):
        sample_to_complete_freq_dict[0].append(complete_entity_freq_dict[item])

    ax1_x_axis = range(1, 101)

    ax1_y_axis = []
    empirical_mean_list = []
    expected_mean_list = []
    for num_sample in ax1_x_axis:
        # compute sample to complete
        empirical_cnt_dist = sample_to_complete_freq_dict[num_sample]
        neg_binomial_cnt_dist = []
        for x in range(num_sample, max(30, 3 * num_sample + 1)):
            neg_binomial_cnt_dist.extend([x] * int(negative_binomial(x, num_sample, rho) * len(empirical_cnt_dist)))
        ks_test = stats.ks_2samp(empirical_cnt_dist, neg_binomial_cnt_dist)
        empirical_mean = sum(empirical_cnt_dist) / len(empirical_cnt_dist)
        empirical_mean_list.append(empirical_mean)
        expected_mean = sum(neg_binomial_cnt_dist) / len(neg_binomial_cnt_dist)
        expected_mean_list.append(expected_mean)
        print('num_sample: {0}, number of Bernoulli trials: {1}, d_statistic: {2:.4f}, p: {3:.4f}, expected mean: {4:.2f}, empirical mean: {5:.2f}'
              .format(num_sample, len(empirical_cnt_dist), ks_test[0], ks_test[1], expected_mean, empirical_mean))
        ax1_y_axis.append(ks_test[0])

    axes[0].plot(ax1_x_axis, ax1_y_axis, c='k', lw=1.5, ls='-')

    axes[0].set_xlabel(r'sample frequency $n_s$', fontsize=16)
    axes[0].set_ylabel('D-statistic', fontsize=16)
    axes[0].set_xlim([-2, 102])
    axes[0].set_xticks([0, 25, 50, 75, 100])
    axes[0].set_ylim([0, 0.17])
    axes[0].yaxis.set_major_formatter(FuncFormatter(lambda x, _: '{0:.2f}'.format(x)))
    axes[0].tick_params(axis='both', which='major', labelsize=16)
    axes[0].set_title('(a)', fontsize=18, pad=-3*72, y=1.0001)

    # show an example
    num_sample = np.argmin(ax1_y_axis) + 1

    axes[0].scatter(num_sample, ax1_y_axis[num_sample - 1], s=40, c=blue, zorder=30)
    axes[0].set_yticks([0, ax1_y_axis[num_sample - 1], 0.05, 0.1, 0.15])
    axes[0].plot([axes[0].get_xlim()[0], num_sample], [ax1_y_axis[num_sample - 1], ax1_y_axis[num_sample - 1]], color=blue, ls='--', lw=1)
    axes[0].plot([num_sample, num_sample], [axes[0].get_ylim()[0], ax1_y_axis[num_sample - 1]], color=blue, ls='--', lw=1)

    # plot sample to complete
    ax2_x_axis = range(num_sample, max(30, 3 * num_sample + 1))
    num_items = len(sample_to_complete_freq_dict[num_sample])
    sample_to_complete_cnt = Counter(sample_to_complete_freq_dict[num_sample])
    ax2_y_axis = [sample_to_complete_cnt[x] / num_items for x in ax2_x_axis]
    ax2_neg_binomial_axis = [negative_binomial(x, num_sample, rho) for x in ax2_x_axis]

    axes[1].plot(ax2_x_axis, ax2_y_axis, c=blue, lw=1.5, ls='-', marker='o', zorder=20, label='empirical')
    axes[1].plot(ax2_x_axis, ax2_neg_binomial_axis, c='k', lw=1.5, ls='-', marker='x', zorder=10, label='negative binomial')

    axes[1].set_xlabel(r'complete frequency $n_c$', fontsize=16)
    axes[1].set_ylabel(r'Pr($n_c$|$n_s$={0})'.format(num_sample), fontsize=16)
    axes[1].set_xticks([num_sample, 2 * num_sample, 3 * num_sample])
    axes[1].set_ylim([-0.005, 0.15])
    axes[1].set_yticks([0, 0.05, 0.1])
    axes[1].tick_params(axis='both', which='major', labelsize=16)
    axes[1].legend(frameon=False, fontsize=16, ncol=1, fancybox=False, shadow=True, loc='upper left')
    axes[1].set_title('(b)', fontsize=18, pad=-3*72, y=1.0001)

    axes[1].plot([empirical_mean_list[num_sample - 1], empirical_mean_list[num_sample - 1]], [axes[1].get_ylim()[0], 0.1], color=blue, ls='--', lw=1)
    axes[1].plot([expected_mean_list[num_sample - 1], expected_mean_list[num_sample - 1]], [axes[1].get_ylim()[0], 0.1], color='k', ls='--', lw=1)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/entity_negative_binomial.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
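
# 'negative_binomial' is not shown in this snippet. The loop above treats it
# as Pr(n_c = x | n_s = num_sample) under a per-tweet sampling rate rho, i.e.
# the probability that the num_sample-th sampled tweet arrives exactly on
# trial x. A minimal sketch of that parameterisation (assumed):
from scipy.special import comb

def negative_binomial(x, k, rho):
    """Pr[the k-th success occurs exactly on Bernoulli trial x], success prob rho."""
    if x < k:
        return 0.0
    return comb(x - 1, k - 1) * (rho ** k) * ((1 - rho) ** (x - k))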
Example #28
def main():
    timer = Timer()
    timer.start()

    cc4 = ColorPalette.CC4
    blue = cc4[0]

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))

    timestamp_list = []
    sec_count_dict = defaultdict(int)
    ms_list = []

    with open('rate_limit_2015-09-08.txt', 'r') as fin:
        for line in fin:
            rate_json = json.loads(line.rstrip())
            ms_list.append(int(rate_json['limit']['timestamp_ms'][-3:]))
            timestamp = datetime.utcfromtimestamp(
                (int(rate_json['limit']['timestamp_ms']) - 666) // 1000)
            timestamp_list.append(timestamp)
            sec_count_dict[timestamp] += 1

    print('{0:.2f}% rate limit messages come from millisecond 700 to 1000'.
          format(len([x for x in ms_list if x >= 700]) / len(ms_list) * 100))

    sns.distplot(ms_list,
                 bins=200,
                 color=blue,
                 ax=axes[0],
                 kde_kws={
                     'shade': False,
                     'linewidth': 1.5,
                     'color': 'k'
                 })
    axes[0].set_xticks([0, 250, 500, 750, 1000])
    axes[0].set_xlim([-50, 1050])
    axes[0].set_xlabel('millisecond', fontsize=16)
    axes[0].set_ylabel('density', fontsize=16)
    axes[0].tick_params(axis='both', which='major', labelsize=16)
    axes[0].set_title('(a)', size=18, pad=-3 * 72, y=1.0001)

    sec_count_stats = Counter(sec_count_dict.values())
    x_axis = sorted(sec_count_stats.keys())
    axes[1].bar(x_axis, [sec_count_stats[x] for x in x_axis],
                facecolor=blue,
                edgecolor='k',
                width=0.7)
    axes[1].set_xticks([1, 2, 3, 4])
    axes[1].set_xlim([0, 5])
    axes[1].set_xlabel('#rate limit messages per second', fontsize=16)
    axes[1].set_ylabel('frequency', fontsize=16)
    axes[1].yaxis.set_major_formatter(FuncFormatter(concise_fmt))
    axes[1].tick_params(axis='both', which='major', labelsize=16)
    axes[1].set_title('(b)', size=18, pad=-3 * 72, y=1.0001)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/SI_ratemsg_dist.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
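
# 'concise_fmt' is passed to FuncFormatter in several of these figures (and
# called directly with a None position elsewhere) but is not shown. A minimal
# sketch with the matching (value, position) signature that abbreviates large
# numbers (assumed):
def concise_fmt(x, pos=None):
    """Format tick values as 10K / 1.5M style abbreviations."""
    if abs(x) >= 1000000:
        return '{0:.1f}M'.format(x / 1000000)
    if abs(x) >= 1000:
        return '{0:.0f}K'.format(x / 1000)
    return '{0:.0f}'.format(x)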
def main():
    timer = Timer()
    timer.start()

    n_cluster = 6

    complete_cluster_size_list = []
    for i in range(n_cluster):
        with open('{0}_cluster{1}.txt'.format('complete', i), 'r') as fin:
            complete_nodes = set(fin.readline().split(','))
            num_entities = len(complete_nodes)
            num_users = len([x for x in complete_nodes if x.startswith('u')])
            num_hashtags = num_entities - num_users
            complete_cluster_size_list.append((num_entities, num_users, num_hashtags))
    complete_sorted_by_size = sorted(enumerate(complete_cluster_size_list), key=lambda x: x[1][0], reverse=True)
    # manually reorder the complete clusters so they line up with the matching sample clusters
    complete_sorted_by_size = [complete_sorted_by_size[i] for i in [0, 4, 5, 3, 2, 1]]
    print(complete_sorted_by_size)

    sample_cluster_size_list = []
    for i in range(n_cluster):
        with open('{0}_cluster{1}.txt'.format('sample', i), 'r') as fin:
            sample_nodes = set(fin.readline().split(','))
            num_entities = len(sample_nodes)
            num_users = len([x for x in sample_nodes if x.startswith('u')])
            num_hashtags = num_entities - num_users
            sample_cluster_size_list.append((num_entities, num_users, num_hashtags))
    sample_sorted_by_size = sorted(enumerate(sample_cluster_size_list), key=lambda x: x[1][0], reverse=True)
    # manually reorder the sample clusters to keep the same alignment as above
    sample_sorted_by_size = [sample_sorted_by_size[i] for i in [0, 1, 4, 2, 5, 3]]
    print(sample_sorted_by_size)

    complete_clusters_list = []
    for i, _ in complete_sorted_by_size:
        with open('{0}_cluster{1}.txt'.format('complete', i), 'r') as fin:
            complete_nodes = set(fin.readline().split(','))
            complete_clusters_list.append(complete_nodes)

    sample_clusters_list = []
    for i, _ in sample_sorted_by_size:
        with open('{0}_cluster{1}.txt'.format('sample', i), 'r') as fin:
            sample_nodes = set(fin.readline().split(','))
            sample_clusters_list.append(sample_nodes)

    col_labels = ['SC1', 'SC2', 'SC3', 'SC4', 'SC5', 'SC6', 'Missing', 'Total']
    row_labels = ['CC1', 'CC2', 'CC3', 'CC4', 'CC5', 'CC6', 'Total']
    n_row = len(row_labels)
    n_col = len(col_labels)
    confusion_mat = np.zeros(shape=(n_row, n_col))
    confusion_mat_rate = np.zeros(shape=(n_row, n_col))
    confusion_mat_annot = [[[] for _ in range(n_col)] for _ in range(n_row)]
    for i in range(n_row - 1):
        cnt0 = cnt = complete_sorted_by_size[i][1][0]
        print('from complete cluster {0}'.format(i + 1), cnt0)
        for j in range(n_col - 2):
            tmp = len(complete_clusters_list[i].intersection(sample_clusters_list[j]))
            print('>>> to sample cluster {0}: '.format(j + 1), tmp, tmp/cnt0)
            confusion_mat[i, j] = tmp
            confusion_mat_rate[i, j] = tmp / cnt0
            cnt -= tmp
            if tmp > 0:
                confusion_mat_annot[i][j] = '{0}\n{1:.1f}%'.format(concise_fmt(tmp, None), 100*tmp/cnt0)
            else:
                confusion_mat_annot[i][j] = '{0}'.format(concise_fmt(tmp, None))
        print('>>> to missing: ', cnt/cnt0)
        confusion_mat[i, -2] = cnt
        if cnt > 0:
            confusion_mat_annot[i][-2] = '{0}\n{1:.1f}%'.format(concise_fmt(cnt, None), 100*cnt / cnt0)
        else:
            confusion_mat_annot[i][-2] = '{0}'.format(concise_fmt(cnt, None))
        confusion_mat_rate[i, -2] = cnt / cnt0
        confusion_mat[i, -1] = cnt0
        # confusion_mat_rate[i, -1] = 0

    # fill the marginal 'Total' row and column with absolute counts; the loop
    # runs over n_row (= n_col - 1), so it covers every row total and every
    # column total except the bottom-right grand total set afterwards
    for j in range(n_row):
        confusion_mat_annot[j][-1] = '{0}'.format(concise_fmt(confusion_mat[j, -1], None))
        confusion_mat_annot[-1][j] = '{0}'.format(concise_fmt(sum(confusion_mat[:-1, j]), None))
    confusion_mat_annot[-1][-1] = '{0}'.format(concise_fmt(sum(confusion_mat[:-1, -1]), None))

    confusion_mat_annot = np.array(confusion_mat_annot)

    fig, ax1 = plt.subplots(1, 1)
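    # color each cell by the transition ratio, annotate it with absolute counts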
    sns.heatmap(confusion_mat_rate, annot=confusion_mat_annot, cmap=ccmap,
                fmt='s', ax=ax1,
                cbar_kws={'label': 'ratio from complete clusters to sample clusters', 'shrink': .6})

    ax1.set_title('clusters in sample set', loc='right')
    ax1.set_title('clusters in complete set', loc='left')
    ax1.set_xticklabels(col_labels, ha='center')
    ax1.set_yticklabels(row_labels, rotation=90, va='center')
    ax1.xaxis.tick_top()
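    # draw separator lines around the marginal 'Total' row and column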
    ax1.hlines(y=0, xmin=n_col-1, xmax=n_col)
    ax1.hlines(y=n_row-1, xmin=0, xmax=n_col-1)
    ax1.hlines(y=n_row, xmin=0, xmax=n_col)
    ax1.vlines(x=0, ymin=n_row-1, ymax=n_row)
    ax1.vlines(x=n_col-1, ymin=0, ymax=n_row-1)
    ax1.vlines(x=n_col, ymin=0, ymax=n_row)

    cbar_ax = fig.axes[-1]
    cbar_ax.set_frame_on(True)

    timer.stop()

    plt.tight_layout(rect=[0.04, 0, 1, 1])
    plt.savefig('../images/measure_bipartite_cluster_flow.pdf', bbox_inches='tight')
    if platform.system() != 'Linux':
        plt.show()

def main():
    timer = Timer()
    timer.start()

    app_name = 'cyberbullying'

    hours_in_day = 24
    minutes_in_hour = 60
    seconds_in_minute = 60
    ms_in_second = 1000

    num_bins = 100
    width = ms_in_second // num_bins
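    # split each second into 100 bins of 10 ms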

    num_top = 500

    fig, axes = plt.subplots(1,
                             2,
                             figsize=(7.2, 4.8),
                             gridspec_kw={'width_ratios': [2.75, 3]})
    axes = axes.ravel()

    confusion_sampling_rate = np.load(
        '../data/{0}_out/{0}_confusion_sampling_rate.npy'.format(app_name))
    confusion_sampling_rate = np.nan_to_num(confusion_sampling_rate)
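    # sampling rates indexed by (hour, minute, second, 10 ms-bin); NaN entries
    # are replaced with 0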

    load_external_data = True
    if not load_external_data:
        sample_entity_stats = defaultdict(int)
        with open('../data/{0}_out/user_{0}_all.txt'.format(app_name),
                  'r') as fin:
            for line in fin:
                split_line = line.rstrip().split(',')
                sample_entity_stats[split_line[1]] += 1

        # == == == == == == Part 1: Find top users in sample set == == == == == == #
        print('>>> found top {0} users in sample set...'.format(num_top))
        sample_top = [
            kv[0] for kv in sorted(sample_entity_stats.items(),
                                   key=lambda x: x[1],
                                   reverse=True)[:num_top]
        ]

        # == == == == == == Part 2: Find tweets appearing in complete set == == == == == == #
        complete_post_lists_hour = [[0] * hours_in_day for _ in range(num_top)]
        complete_post_lists_min = [[0] * minutes_in_hour
                                   for _ in range(num_top)]
        complete_post_lists_sec = [[0] * seconds_in_minute
                                   for _ in range(num_top)]
        complete_post_lists_10ms = [[0] * num_bins for _ in range(num_top)]

        complete_entity_stats = defaultdict(int)
        with open('../data/{0}_out/complete_user_{0}.txt'.format(app_name),
                  'r') as fin:
            for line in fin:
                split_line = line.rstrip().split(',')
                user_id = split_line[1]
                if user_id in sample_top:
                    complete_entity_stats[user_id] += 1

                    user_idx = sample_top.index(user_id)
                    tweet_id = split_line[0]
                    timestamp_ms = melt_snowflake(tweet_id)[0]
                    dt_obj = datetime.utcfromtimestamp(timestamp_ms // 1000)
                    hour = dt_obj.hour
                    minute = dt_obj.minute
                    second = dt_obj.second
                    millisec = timestamp_ms % 1000
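                    # shift by 7 ms (wrapping within the second) and bin the
                    # millisecond offset into 10 ms buckets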
                    ms_idx = (millisec - 7) // width if millisec >= 7 else (
                        1000 + millisec - 7) // width

                    complete_post_lists_hour[user_idx][hour] += 1
                    complete_post_lists_min[user_idx][minute] += 1
                    complete_post_lists_sec[user_idx][second] += 1
                    complete_post_lists_10ms[user_idx][ms_idx] += 1

        write_to_file('./complete_post_lists_hour.txt', sample_top,
                      complete_post_lists_hour)
        write_to_file('./complete_post_lists_min.txt', sample_top,
                      complete_post_lists_min)
        write_to_file('./complete_post_lists_sec.txt', sample_top,
                      complete_post_lists_sec)
        write_to_file('./complete_post_lists_10ms.txt', sample_top,
                      complete_post_lists_10ms)

        print('>>> finish dumping complete lists...')
        timer.stop()

        # == == == == == == Part 3: Find appearing tweets in sample set and estimate complete counts == == == == == == #
        sample_post_lists_hour = [[0] * hours_in_day for _ in range(num_top)]
        sample_post_lists_min = [[0] * minutes_in_hour for _ in range(num_top)]
        sample_post_lists_sec = [[0] * seconds_in_minute
                                 for _ in range(num_top)]
        sample_post_lists_10ms = [[0] * num_bins for _ in range(num_top)]

        estimated_post_lists_hour = [[0] * hours_in_day
                                     for _ in range(num_top)]
        estimated_post_lists_min = [[0] * minutes_in_hour
                                    for _ in range(num_top)]
        estimated_post_lists_sec = [[0] * seconds_in_minute
                                    for _ in range(num_top)]
        estimated_post_lists_10ms = [[0] * num_bins for _ in range(num_top)]

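        # marginal sampling rates at coarser granularities, obtained by
        # averaging over the finer time axes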
        hourly_conversion = np.mean(confusion_sampling_rate, axis=(1, 2, 3))
        minutey_conversion = np.mean(confusion_sampling_rate, axis=(2, 3))
        secondly_conversion = np.mean(confusion_sampling_rate, axis=(3))

        with open('../data/{0}_out/user_{0}_all.txt'.format(app_name),
                  'r') as fin:
            for line in fin:
                split_line = line.rstrip().split(',')
                user_id = split_line[1]
                if user_id in sample_top:
                    user_idx = sample_top.index(user_id)
                    tweet_id = split_line[0]
                    timestamp_ms = melt_snowflake(tweet_id)[0]
                    dt_obj = datetime.utcfromtimestamp(timestamp_ms // 1000)
                    hour = dt_obj.hour
                    minute = dt_obj.minute
                    second = dt_obj.second
                    millisec = timestamp_ms % 1000
                    ms_idx = (millisec - 7) // width if millisec >= 7 else (
                        1000 + millisec - 7) // width

                    sample_post_lists_hour[user_idx][hour] += 1
                    sample_post_lists_min[user_idx][minute] += 1
                    sample_post_lists_sec[user_idx][second] += 1
                    sample_post_lists_10ms[user_idx][ms_idx] += 1

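                    # inverse-probability weighting: each observed tweet adds
                    # 1 / sampling_rate to the estimated complete count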
                    estimated_post_lists_hour[user_idx][
                        hour] += 1 / hourly_conversion[hour]
                    estimated_post_lists_min[user_idx][
                        minute] += 1 / minutey_conversion[hour, minute]
                    estimated_post_lists_sec[user_idx][
                        second] += 1 / secondly_conversion[hour, minute,
                                                           second]
                    estimated_post_lists_10ms[user_idx][
                        ms_idx] += 1 / confusion_sampling_rate[hour, minute,
                                                               second, ms_idx]

        write_to_file('./sample_post_lists_hour.txt', sample_top,
                      sample_post_lists_hour)
        write_to_file('./sample_post_lists_min.txt', sample_top,
                      sample_post_lists_min)
        write_to_file('./sample_post_lists_sec.txt', sample_top,
                      sample_post_lists_sec)
        write_to_file('./sample_post_lists_10ms.txt', sample_top,
                      sample_post_lists_10ms)

        write_to_file('./estimated_post_lists_hour.txt', sample_top,
                      estimated_post_lists_hour)
        write_to_file('./estimated_post_lists_min.txt', sample_top,
                      estimated_post_lists_min)
        write_to_file('./estimated_post_lists_sec.txt', sample_top,
                      estimated_post_lists_sec)
        write_to_file('./estimated_post_lists_10ms.txt', sample_top,
                      estimated_post_lists_10ms)

        print('>>> finish dumping sample and estimated lists...')
        timer.stop()
    else:
        sample_top = []
        complete_post_lists_hour = []
        with open('./complete_post_lists_hour.txt', 'r') as fin:
            for line in fin:
                user_id, total, records = line.rstrip().split('\t')
                sample_top.append(user_id)
                records = list(map(int, records.split(',')))
                complete_post_lists_hour.append(records)

        sample_post_lists_hour = read_from_file('./sample_post_lists_hour.txt',
                                                dtype=0)
        sample_post_lists_min = read_from_file('./sample_post_lists_min.txt',
                                               dtype=0)
        sample_post_lists_sec = read_from_file('./sample_post_lists_sec.txt',
                                               dtype=0)
        sample_post_lists_10ms = read_from_file('./sample_post_lists_10ms.txt',
                                                dtype=0)

        estimated_post_lists_hour = read_from_file(
            './estimated_post_lists_hour.txt', dtype=1)
        estimated_post_lists_min = read_from_file(
            './estimated_post_lists_min.txt', dtype=1)
        estimated_post_lists_sec = read_from_file(
            './estimated_post_lists_sec.txt', dtype=1)
        estimated_post_lists_10ms = read_from_file(
            './estimated_post_lists_10ms.txt', dtype=1)

    # == == == == == == Part 4: Find the best estimation by comparing temporal entropy == == == == == == #
    ret = {}
    num_estimate_list = []
    num_sample_list = []
    num_complete_list = []

    sample_entity_stats = {
        user_id: sum(sample_post_lists_hour[user_idx])
        for user_idx, user_id in enumerate(sample_top)
    }
    complete_entity_stats = {
        user_id: sum(complete_post_lists_hour[user_idx])
        for user_idx, user_id in enumerate(sample_top)
    }

    min_mat = np.array([], dtype=np.int64).reshape(0, 60)
    sec_mat = np.array([], dtype=np.int64).reshape(0, 60)
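    # per-user minute- and second-level histograms, stacked row by row and
    # dumped to text files below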

    for user_idx, user_id in enumerate(sample_top):
        num_sample = sample_entity_stats[user_id]
        num_complete = complete_entity_stats[user_id]

        hour_entropy = entropy(sample_post_lists_hour[user_idx],
                               base=hours_in_day)
        min_entropy = entropy(sample_post_lists_min[user_idx],
                              base=minutes_in_hour)
        sec_entropy = entropy(sample_post_lists_sec[user_idx],
                              base=seconds_in_minute)
        ms10_entropy = entropy(sample_post_lists_10ms[user_idx], base=num_bins)
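        # entropy of the posting-time histogram at each granularity,
        # normalized to [0, 1] by using the number of bins as the log base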

        min_mat = np.vstack(
            (min_mat, np.array(sample_post_lists_min[user_idx]).reshape(1,
                                                                        -1)))
        sec_mat = np.vstack(
            (sec_mat, np.array(sample_post_lists_sec[user_idx]).reshape(1,
                                                                        -1)))

        # entropy-based choice of granularity (0: hour, 1: minute, 2: second)
        min_entropy, min_entropy_idx = min(
            (ent, idx)
            for idx, ent in enumerate([hour_entropy, min_entropy, sec_entropy]))
        # the choice is then overridden below: use the 10 ms-level estimate
        # when its entropy is low enough, otherwise the second-level estimate
        if ms10_entropy < 0.87:
            min_entropy_idx = 3
        else:
            min_entropy_idx = 2

        num_estimate = sum([
            estimated_post_lists_hour[user_idx],
            estimated_post_lists_min[user_idx],
            estimated_post_lists_sec[user_idx],
            estimated_post_lists_10ms[user_idx]
        ][min_entropy_idx])
        num_estimate_list.append(num_estimate)

        num_sample_list.append(num_sample)
        num_complete_list.append(num_complete)

        ret[user_id] = (num_sample, num_complete, num_estimate,
                        min_entropy_idx)

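    # note: np.savetxt writes comma-separated text; despite the .npy suffix,
    # these are not binary .npy files (np.save would produce those)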
    np.savetxt('min_sample.npy', min_mat, delimiter=',')
    np.savetxt('sec_sample.npy', sec_mat, delimiter=',')

    rank_by_sample = [
        k for k, v in sorted(
            ret.items(), key=lambda item: item[1][0], reverse=True)
    ]
    rank_by_complete = [
        k for k, v in sorted(
            ret.items(), key=lambda item: item[1][1], reverse=True)
    ]
    rank_by_estimated = [
        k for k, v in sorted(
            ret.items(), key=lambda item: item[1][2], reverse=True)
    ]
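    # user rankings by observed (sample), complete, and estimated tweet counts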

    for user_idx, user_id in enumerate(sample_top):
        print(user_id, ret[user_id][:-1],
              (rank_by_sample.index(user_id) + 1,
               rank_by_complete.index(user_id) + 1,
               rank_by_estimated.index(user_id) + 1))
        print(
            ret[user_id][0] / ret[user_id][1],
            mape(ret[user_id][1], ret[user_id][2])[0],
            rank_by_sample.index(user_id) - rank_by_complete.index(user_id),
            rank_by_estimated.index(user_id) - rank_by_complete.index(user_id))
        print(np.sum(np.array(sample_post_lists_min[user_idx]) > 0),
              np.sum(np.array(sample_post_lists_sec[user_idx]) > 0),
              np.sum(np.array(sample_post_lists_10ms[user_idx]) > 0))

    observed_top100 = rank_by_sample[:100]
    complete_rank_for_observed_top100 = [
        rank_by_complete.index(uid) + 1 for uid in observed_top100
    ]
    user_sampling_rates_for_observed_top100 = [
        sample_entity_stats[uid] / complete_entity_stats[uid]
        for uid in observed_top100
    ]
    print('kendall tau for observed',
          kendalltau(range(1, 101), complete_rank_for_observed_top100))

    estimated_top100 = rank_by_estimated[:100]
    complete_rank_for_estimated_top100 = [
        rank_by_complete.index(uid) + 1 for uid in estimated_top100
    ]
    user_sampling_rates_for_estimated_top100 = [
        sample_entity_stats[uid] / complete_entity_stats[uid]
        for uid in estimated_top100
    ]
    print('kendall tau for estimated',
          kendalltau(range(1, 101), complete_rank_for_estimated_top100))

    axes[0].scatter(range(1, 101),
                    complete_rank_for_observed_top100,
                    s=30,
                    c=user_sampling_rates_for_observed_top100,
                    edgecolors='gray',
                    vmin=0.2,
                    vmax=0.9,
                    cmap=cm,
                    zorder=50)
    axes[0].set_xlabel('observed rank in sample set', fontsize=13)
    axes[0].set_ylabel('rank in complete set', fontsize=13)
    axes[0].text(0.04,
                 0.9,
                 r"kendall's $\tau$: {0:.4f}".format(
                     kendalltau(range(1, 101),
                                complete_rank_for_observed_top100)[0]),
                 ha='left',
                 va='top',
                 size=12,
                 transform=axes[0].transAxes)
    axes[0].plot([0, 100], [100, 100], color='gray', ls='--', lw=1)
    axes[0].plot([100, 100], [0, 100], color='gray', ls='--', lw=1)
    axes[0].plot([0, 100], [0, 100], color='gray', ls='--', lw=1)
    axes[0].set_title('(a)', fontsize=13)

    sc = axes[1].scatter(range(1, 101),
                         complete_rank_for_estimated_top100,
                         s=30,
                         c=user_sampling_rates_for_estimated_top100,
                         edgecolors='gray',
                         vmin=0.2,
                         vmax=0.9,
                         cmap=cm,
                         zorder=50)
    axes[1].set_xlabel('estimated rank in sample set', fontsize=13)
    axes[1].plot([0, 100], [100, 100], color='gray', ls='--', lw=1)
    axes[1].plot([100, 100], [0, 100], color='gray', ls='--', lw=1)
    axes[1].plot([0, 100], [0, 100], color='gray', ls='--', lw=1)
    axes[1].text(0.04,
                 0.9,
                 r"kendall's $\tau$: {0:.4f}".format(
                     kendalltau(range(1, 101),
                                complete_rank_for_estimated_top100)[0]),
                 ha='left',
                 va='top',
                 size=12,
                 transform=axes[1].transAxes)
    axes[1].set_ylim(axes[0].get_ylim())
    axes[1].set_title('(b)', fontsize=13)

    cb = plt.colorbar(sc, fraction=0.055)
    cb.set_label(label='user sampling rate', size=13)
    cb.ax.tick_params(labelsize=11)

    for ax in axes[:2]:
        ax.set_xlim([-4, 104])
        ax.set_ylim(bottom=-4)
        ax.set_xticks([0, 50, 100])
        ax.set_yticks([0, 50, 100])
        ax.tick_params(axis='both', which='major', labelsize=11)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/top_entity_rank.pdf', bbox_inches='tight')
    if platform.system() != 'Linux':
        plt.show()