# Rate limit message disentangling (panel a: raw messages; panel b: colored by subcrawler).
# Output: ../images/SI_ratemsg_coloring.pdf
import json
import platform
from datetime import datetime

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.ticker import FuncFormatter

# Timer, ColorPalette, hide_spines, concise_fmt and map_ratemsg are helpers defined elsewhere in this repo.


def main():
    timer = Timer()
    timer.start()

    cc4 = ColorPalette.CC4
    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))

    start_idx = 21000
    end_idx = 25500
    timestamp_list = []
    track_list = []
    with open('rate_limit_2015-09-08.txt', 'r') as fin:
        for line in fin:
            rate_json = json.loads(line.rstrip())
            track = rate_json['limit']['track']
            track_list.append(track)
            # 'timestamp_ms' is a millisecond string; drop the last 3 digits to get seconds
            timestamp = datetime.utcfromtimestamp(int(rate_json['limit']['timestamp_ms'][:-3]))
            timestamp_list.append(timestamp)

    axes[0].scatter(timestamp_list[start_idx: end_idx], track_list[start_idx: end_idx], c='k', s=0.4)
    axes[0].set_xlim([timestamp_list[start_idx], timestamp_list[end_idx]])
    axes[0].xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))
    axes[0].set_xticks(axes[0].get_xticks()[::2])
    axes[0].set_xlabel('Sep 08, 2015', fontsize=16)
    axes[0].set_ylabel('value', fontsize=16)
    axes[0].yaxis.set_major_formatter(FuncFormatter(concise_fmt))
    axes[0].tick_params(axis='both', which='major', labelsize=16)
    axes[0].set_title('(a)', size=18, pad=-3 * 72, y=1.0001)

    print('start timestamp', timestamp_list[start_idx])
    print('end timestamp', timestamp_list[end_idx])

    # split the interleaved rate limit messages into one monotone track per subcrawler
    split_track_lst, split_ts_lst = map_ratemsg(track_list[start_idx: end_idx],
                                                timestamp_list[start_idx: end_idx])

    total_miss = 0
    for track_lst, ts_lst, color in zip(split_track_lst, split_ts_lst, cc4):
        axes[1].scatter(ts_lst, track_lst, c=color, s=0.4)
        # each track value is cumulative, so last minus first gives the tweets missed by this subcrawler
        total_miss += (track_lst[-1] - track_lst[0])
    print('{0} tweets are missing'.format(total_miss))

    axes[1].set_xlim([timestamp_list[start_idx], timestamp_list[end_idx]])
    axes[1].xaxis.set_major_formatter(mdates.DateFormatter('%H:%M:%S'))
    axes[1].set_xticks(axes[1].get_xticks()[::2])
    axes[1].set_xlabel('Sep 08, 2015', fontsize=16)
    axes[1].yaxis.set_major_formatter(FuncFormatter(concise_fmt))
    axes[1].tick_params(axis='both', which='major', labelsize=16)
    axes[1].set_title('(b)', size=18, pad=-3 * 72, y=1.0001)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/SI_ratemsg_coloring.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
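# The splitting helper map_ratemsg is not shown in this section. Below is a minimal greedy
# sketch of the idea, assuming each subcrawler's 'track' counter is cumulative so that the
# interleaved messages can be disentangled into monotonically non-decreasing subsequences;
# the repo's actual implementation may use a more careful assignment rule.
def map_ratemsg(track_list, ts_list):
    """Sketch: split interleaved rate limit messages into per-subcrawler tracks."""
    split_track_lst, split_ts_lst = [], []
    for track, ts in zip(track_list, ts_list):
        # attach to the subsequence whose last value is the largest one not exceeding track
        best_idx, best_val = None, None
        for idx, seq in enumerate(split_track_lst):
            if seq[-1] <= track and (best_val is None or seq[-1] > best_val):
                best_idx, best_val = idx, seq[-1]
        if best_idx is None:
            # no existing subsequence fits, start a new one
            split_track_lst.append([track])
            split_ts_lst.append([ts])
        else:
            split_track_lst[best_idx].append(track)
            split_ts_lst[best_idx].append(ts)
    return split_track_lst, split_ts_lst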
# Goodness of fit of the negative binomial model for user posting frequencies
# (panel a: KS D-statistic vs sample frequency; panel b: an example distribution).
# Output: ../images/entity_negative_binomial.pdf
import platform
from collections import Counter, defaultdict

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Timer, ColorPalette, hide_spines and negative_binomial are helpers defined elsewhere in this repo.


def main():
    timer = Timer()
    timer.start()

    cc4 = ColorPalette.CC4
    blue = cc4[0]

    app_name = 'cyberbullying'
    rho = 0.5272
    entity = 'user'

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))
    print('for entity: {0}'.format(entity))

    # entity frequency in the sample set
    sample_entity_freq_dict = defaultdict(int)
    with open('../data/{1}_out/{0}_{1}_all.txt'.format(entity, app_name), 'r') as sample_datefile:
        for line in sample_datefile:
            sample_entity_freq_dict[line.rstrip().split(',')[1]] += 1

    # entity frequency in the complete set
    complete_entity_freq_dict = defaultdict(int)
    with open('../data/{1}_out/complete_{0}_{1}.txt'.format(entity, app_name), 'r') as complete_datefile:
        for line in complete_datefile:
            complete_entity_freq_dict[line.rstrip().split(',')[1]] += 1

    complete_to_sample_freq_dict = defaultdict(list)
    sample_to_complete_freq_dict = defaultdict(list)
    for item, complete_vol in complete_entity_freq_dict.items():
        if item in sample_entity_freq_dict:
            complete_to_sample_freq_dict[complete_vol].append(sample_entity_freq_dict[item])
        else:
            complete_to_sample_freq_dict[complete_vol].append(0)
    for item, sample_vol in sample_entity_freq_dict.items():
        sample_to_complete_freq_dict[sample_vol].append(complete_entity_freq_dict[item])
    for item in set(complete_entity_freq_dict.keys()) - set(sample_entity_freq_dict.keys()):
        sample_to_complete_freq_dict[0].append(complete_entity_freq_dict[item])

    ax1_x_axis = range(1, 101)
    ax1_y_axis = []
    empirical_mean_list = []
    expected_mean_list = []
    for num_sample in ax1_x_axis:
        # compare the empirical sample-to-complete distribution with the negative binomial model
        empirical_cnt_dist = sample_to_complete_freq_dict[num_sample]
        neg_binomial_cnt_dist = []
        for x in range(num_sample, max(30, 3 * num_sample + 1)):
            neg_binomial_cnt_dist.extend([x] * int(negative_binomial(x, num_sample, rho) * len(empirical_cnt_dist)))
        ks_test = stats.ks_2samp(empirical_cnt_dist, neg_binomial_cnt_dist)
        empirical_mean = sum(empirical_cnt_dist) / len(empirical_cnt_dist)
        empirical_mean_list.append(empirical_mean)
        expected_mean = sum(neg_binomial_cnt_dist) / len(neg_binomial_cnt_dist)
        expected_mean_list.append(expected_mean)
        print('num_sample: {0}, number of Bernoulli trials: {1}, d_statistic: {2:.4f}, p: {3:.4f}, '
              'expected mean: {4:.2f}, empirical mean: {5:.2f}'
              .format(num_sample, len(empirical_cnt_dist), ks_test[0], ks_test[1], expected_mean, empirical_mean))
        ax1_y_axis.append(ks_test[0])

    axes[0].plot(ax1_x_axis, ax1_y_axis, c='k', lw=1.5, ls='-')
    axes[0].set_xlabel(r'sample frequency $n_s$', fontsize=16)
    axes[0].set_ylabel('D-statistic', fontsize=16)
    axes[0].set_xlim([-2, 102])
    axes[0].set_xticks([0, 25, 50, 75, 100])
    axes[0].set_ylim([0, 0.17])
    axes[0].yaxis.set_major_formatter(FuncFormatter(lambda x, _: '{0:.2f}'.format(x)))
    axes[0].tick_params(axis='both', which='major', labelsize=16)
    axes[0].set_title('(a)', fontsize=18, pad=-3 * 72, y=1.0001)

    # highlight the sample frequency with the smallest D-statistic as an example
    num_sample = np.argmin(ax1_y_axis) + 1
    axes[0].scatter(num_sample, ax1_y_axis[num_sample - 1], s=40, c=blue, zorder=30)
    axes[0].set_yticks([0, ax1_y_axis[num_sample - 1], 0.05, 0.1, 0.15])
    axes[0].plot([axes[0].get_xlim()[0], num_sample], [ax1_y_axis[num_sample - 1], ax1_y_axis[num_sample - 1]],
                 color=blue, ls='--', lw=1)
    axes[0].plot([num_sample, num_sample], [axes[0].get_ylim()[0], ax1_y_axis[num_sample - 1]],
                 color=blue, ls='--', lw=1)

    # plot the sample-to-complete distribution for the example sample frequency
    ax2_x_axis = range(num_sample, max(30, 3 * num_sample + 1))
    num_items = len(sample_to_complete_freq_dict[num_sample])
    sample_to_complete_cnt = Counter(sample_to_complete_freq_dict[num_sample])
    ax2_y_axis = [sample_to_complete_cnt[x] / num_items for x in ax2_x_axis]
    ax2_neg_binomial_axis = [negative_binomial(x, num_sample, rho) for x in ax2_x_axis]

    axes[1].plot(ax2_x_axis, ax2_y_axis, c=blue, lw=1.5, ls='-', marker='o', zorder=20, label='empirical')
    axes[1].plot(ax2_x_axis, ax2_neg_binomial_axis, c='k', lw=1.5, ls='-', marker='x', zorder=10, label='negative binomial')
    axes[1].set_xlabel(r'complete frequency $n_c$', fontsize=16)
    axes[1].set_ylabel(r'Pr($n_c$|$n_s$={0})'.format(num_sample), fontsize=16)
    axes[1].set_xticks([num_sample, 2 * num_sample, 3 * num_sample])
    axes[1].set_ylim([-0.005, 0.15])
    axes[1].set_yticks([0, 0.05, 0.1])
    axes[1].tick_params(axis='both', which='major', labelsize=16)
    axes[1].legend(frameon=False, fontsize=16, ncol=1, fancybox=False, shadow=True, loc='upper left')
    axes[1].set_title('(b)', fontsize=18, pad=-3 * 72, y=1.0001)
    axes[1].plot([empirical_mean_list[num_sample - 1], empirical_mean_list[num_sample - 1]],
                 [axes[1].get_ylim()[0], 0.1], color=blue, ls='--', lw=1)
    axes[1].plot([expected_mean_list[num_sample - 1], expected_mean_list[num_sample - 1]],
                 [axes[1].get_ylim()[0], 0.1], color='k', ls='--', lw=1)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/entity_negative_binomial.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
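# The model helper negative_binomial(x, n, rho) is not shown here. A minimal sketch is given
# below, assuming it returns the standard negative binomial probability that an entity with
# sample frequency n has complete frequency x when every tweet is kept independently with
# probability rho (i.e., the n-th retained tweet occurs on trial x); the repo's implementation
# may differ in details.
from scipy.special import comb


def negative_binomial(x, n, rho):
    """Sketch: Pr(complete frequency = x | sample frequency = n) under Bernoulli(rho) sampling."""
    if x < n:
        return 0.0
    return comb(x - 1, n - 1) * rho ** n * (1 - rho) ** (x - n)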
# Retweet cascade measures (panel a: inter-arrival time CCDF; panel b: relative potential reach).
# Output: ../images/cascades_measures.pdf
import platform
from collections import defaultdict

import numpy as np
from scipy.stats import percentileofscore
import matplotlib.pyplot as plt

# Timer, ColorPalette, hide_spines, melt_snowflake and plot_ccdf are helpers defined elsewhere in this repo.


def main():
    timer = Timer()
    timer.start()

    app_name = 'cyberbullying'

    sample_cascade_size = {}
    sample_inter_arrival_time = []
    sample_cascade_influence = {}
    sample_cascade_influence_10m = defaultdict(int)
    sample_cascade_influence_1h = defaultdict(int)
    # each line: a 'root_tweet-...' field, then ':', then comma-separated 'retweet_id-influence' pairs
    with open('../data/{0}_out/sample_retweet_{0}.txt'.format(app_name), 'r') as fin:
        for line in fin:
            root_tweet, cascades = line.rstrip().split(':')
            cascades = cascades.split(',')
            root_tweet = root_tweet.split('-')[0]
            retweets = [x.split('-')[0] for x in cascades]
            influences = [int(x.split('-')[1]) for x in cascades]
            sample_cascade_size[root_tweet] = len(retweets)
            sample_cascade_influence[root_tweet] = sum(influences)
            root_timestamp = melt_snowflake(root_tweet)[0] / 1000
            retweet_timestamp_list = [root_timestamp]
            for i in range(len(retweets)):
                retweet_time = melt_snowflake(retweets[i])[0] / 1000
                relative_retweet_time = retweet_time - root_timestamp
                retweet_timestamp_list.append(melt_snowflake(retweets[i])[0] / 1000)
                if relative_retweet_time < 10 * 60:
                    sample_cascade_influence_10m[root_tweet] += influences[i]
                if relative_retweet_time < 60 * 60:
                    sample_cascade_influence_1h[root_tweet] += influences[i]
            for i in range(len(retweet_timestamp_list) - 1):
                sample_inter_arrival_time.append(retweet_timestamp_list[i + 1] - retweet_timestamp_list[i])

    complete_cascade_size = {}
    complete_inter_arrival_time = []
    complete_cascade_influence = {}
    complete_cascade_influence_10m = defaultdict(int)
    complete_cascade_influence_1h = defaultdict(int)
    with open('../data/{0}_out/complete_retweet_{0}.txt'.format(app_name), 'r') as fin:
        for line in fin:
            root_tweet, cascades = line.rstrip().split(':')
            cascades = cascades.split(',')
            root_tweet = root_tweet.split('-')[0]
            retweets = [x.split('-')[0] for x in cascades]
            complete_cascade_size[root_tweet] = len(retweets)
            if len(retweets) >= 50:
                influences = [int(x.split('-')[1]) for x in cascades]
                complete_cascade_influence[root_tweet] = sum(influences)
                root_timestamp = melt_snowflake(root_tweet)[0] / 1000
                retweet_timestamp_list = [root_timestamp]
                for i in range(len(retweets)):
                    retweet_time = melt_snowflake(retweets[i])[0] / 1000
                    relative_retweet_time = retweet_time - root_timestamp
                    retweet_timestamp_list.append(melt_snowflake(retweets[i])[0] / 1000)
                    if relative_retweet_time < 10 * 60:
                        complete_cascade_influence_10m[root_tweet] += influences[i]
                    if relative_retweet_time < 60 * 60:
                        complete_cascade_influence_1h[root_tweet] += influences[i]
                for i in range(len(retweet_timestamp_list) - 1):
                    complete_inter_arrival_time.append(retweet_timestamp_list[i + 1] - retweet_timestamp_list[i])

    print('number of cascades in the complete set', len(complete_cascade_size))
    print('number of cascades in the sample set', len(sample_cascade_size))
    print('mean complete size', np.mean(list(complete_cascade_size.values())))
    print('mean sample size', np.mean(list(sample_cascade_size.values())))
    print('complete #cascades (>=50 retweets)', sum([1 for x in list(complete_cascade_size.values()) if x >= 50]))
    print('sample #cascades (>=50 retweets)', sum([1 for x in list(sample_cascade_size.values()) if x >= 50]))

    num_complete_cascades_in_sample = 0
    complete_cascades_in_sample_size_list = []
    num_complete_cascades_in_sample_50 = 0
    for root_tweet in sample_cascade_size:
        if sample_cascade_size[root_tweet] == complete_cascade_size[root_tweet]:
            num_complete_cascades_in_sample += 1
            complete_cascades_in_sample_size_list.append(complete_cascade_size[root_tweet])
            if complete_cascade_size[root_tweet] >= 50:
                num_complete_cascades_in_sample_50 += 1
    print('number of complete cascades in the sample set', num_complete_cascades_in_sample)
    print('number of complete cascades (>=50 retweets) in the sample set', num_complete_cascades_in_sample_50)
    print('max: {0}, mean: {1}'.format(max(complete_cascades_in_sample_size_list),
                                       np.mean(complete_cascades_in_sample_size_list)))

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))
    cc4 = ColorPalette.CC4
    blue = cc4[0]
    red = cc4[3]

    sample_median = np.median(sample_inter_arrival_time)
    complete_median = np.median(complete_inter_arrival_time)

    plot_ccdf(sample_inter_arrival_time, ax=axes[0], color=blue, ls='-', label='sample')
    plot_ccdf(complete_inter_arrival_time, ax=axes[0], color='k', ls='-', label='complete')
    axes[0].plot([sample_median, sample_median], [0, 1], color=blue, ls='--', lw=1)
    axes[0].plot([complete_median, complete_median], [0, 1], color='k', ls='--', lw=1)

    print('\ninter_arrival_time sample median', sample_median)
    print('inter_arrival_time complete median', complete_median)

    axes[0].set_xscale('symlog')
    axes[0].set_xticks([0, 1, 100, 10000, 1000000])
    axes[0].set_yscale('linear')
    axes[0].set_xlabel('inter-arrival time (sec)', fontsize=16)
    axes[0].set_ylabel(r'$P(X \geq x)$', fontsize=16)
    axes[0].legend(frameon=False, fontsize=16, ncol=1, fancybox=False, shadow=True, loc='upper right')
    axes[0].tick_params(axis='both', which='major', labelsize=16)
    axes[0].set_title('(a)', fontsize=18, pad=-3 * 72, y=1.0001)

    # relative potential reach: sample influence over complete influence, at three observation windows
    influence_list = []
    influence_list_10m = []
    influence_list_1h = []
    for root_tweet in sample_cascade_size:
        if complete_cascade_size[root_tweet] >= 50:
            if complete_cascade_influence[root_tweet] > 0:
                influence_list.append(sample_cascade_influence[root_tweet] / complete_cascade_influence[root_tweet])
            if complete_cascade_influence_10m[root_tweet] > 0:
                influence_list_10m.append(sample_cascade_influence_10m[root_tweet] / complete_cascade_influence_10m[root_tweet])
            if complete_cascade_influence_1h[root_tweet] > 0:
                influence_list_1h.append(sample_cascade_influence_1h[root_tweet] / complete_cascade_influence_1h[root_tweet])

    plot_ccdf(influence_list_10m, ax=axes[1], color=red, ls='-', label='10m')
    plot_ccdf(influence_list_1h, ax=axes[1], color=blue, ls='-', label='1h')
    plot_ccdf(influence_list, ax=axes[1], color='k', ls='-', label='14d')

    print('influence_list median', np.median(influence_list))
    print('influence_list_1h median', np.median(influence_list_1h))
    print('influence_list_10m median', np.median(influence_list_10m))

    print('influence_list 0.25', percentileofscore(influence_list, 0.25))
    print('influence_list_1h 0.25', percentileofscore(influence_list_1h, 0.25))
    print('influence_list_10m 0.25', percentileofscore(influence_list_10m, 0.25))
    print('influence_list 0.75', percentileofscore(influence_list, 0.75))
    print('influence_list_1h 0.75', percentileofscore(influence_list_1h, 0.75))
    print('influence_list_10m 0.75', percentileofscore(influence_list_10m, 0.75))

    axes[1].set_xscale('linear')
    axes[1].set_yscale('linear')
    axes[1].set_xlabel('relative potential reach', fontsize=16)
    # axes[1].set_ylabel(r'$P(X \geq x)$', fontsize=16)
    axes[1].legend(frameon=False, fontsize=16, ncol=1, fancybox=False, shadow=True, loc='upper right')
    axes[1].tick_params(axis='both', which='major', labelsize=16)
    axes[1].set_title('(b)', fontsize=18, pad=-3 * 72, y=1.0001)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/cascades_measures.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
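# melt_snowflake is used above to recover posting times from tweet IDs. A minimal sketch based
# on Twitter's documented Snowflake ID layout (41-bit millisecond timestamp offset by the Twitter
# epoch, 10-bit machine ID, 12-bit sequence number); only the first returned element, the
# millisecond timestamp, is used by these scripts.
TWITTER_EPOCH_MS = 1288834974657  # 2010-11-04T01:42:54.657Z


def melt_snowflake(tweet_id):
    """Sketch: decompose a Snowflake tweet ID into (timestamp_ms, machine_id, sequence)."""
    tweet_id = int(tweet_id)
    timestamp_ms = (tweet_id >> 22) + TWITTER_EPOCH_MS
    machine_id = (tweet_id >> 12) & 0x3FF
    sequence = tweet_id & 0xFFF
    return timestamp_ms, machine_id, sequence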
# Distribution of rate limit message arrival times (panel a: millisecond within the second;
# panel b: number of messages per second). Output: ../images/SI_ratemsg_dist.pdf
import json
import platform
from collections import Counter, defaultdict
from datetime import datetime

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Timer, ColorPalette, hide_spines and concise_fmt are helpers defined elsewhere in this repo.


def main():
    timer = Timer()
    timer.start()

    cc4 = ColorPalette.CC4
    blue = cc4[0]

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))

    timestamp_list = []
    sec_count_dict = defaultdict(int)
    ms_list = []
    with open('rate_limit_2015-09-08.txt', 'r') as fin:
        for line in fin:
            rate_json = json.loads(line.rstrip())
            ms_list.append(int(rate_json['limit']['timestamp_ms'][-3:]))
            # shift by 666 ms before truncating to seconds, then count messages per second
            timestamp = datetime.utcfromtimestamp((int(rate_json['limit']['timestamp_ms']) - 666) // 1000)
            timestamp_list.append(timestamp)
            sec_count_dict[timestamp] += 1

    print('{0:.2f}% rate limit messages come from millisecond 700 to 1000'
          .format(len([x for x in ms_list if x >= 700]) / len(ms_list) * 100))

    sns.distplot(ms_list, bins=200, color=blue, ax=axes[0],
                 kde_kws={'shade': False, 'linewidth': 1.5, 'color': 'k'})
    axes[0].set_xticks([0, 250, 500, 750, 1000])
    axes[0].set_xlim([-50, 1050])
    axes[0].set_xlabel('millisecond', fontsize=16)
    axes[0].set_ylabel('density', fontsize=16)
    axes[0].tick_params(axis='both', which='major', labelsize=16)
    axes[0].set_title('(a)', size=18, pad=-3 * 72, y=1.0001)

    sec_count_stats = Counter(sec_count_dict.values())
    x_axis = sorted(sec_count_stats.keys())
    axes[1].bar(x_axis, [sec_count_stats[x] for x in x_axis], facecolor=blue, edgecolor='k', width=0.7)
    axes[1].set_xticks([1, 2, 3, 4])
    axes[1].set_xlim([0, 5])
    axes[1].set_xlabel('#rate limit messages per second', fontsize=16)
    axes[1].set_ylabel('frequency', fontsize=16)
    axes[1].yaxis.set_major_formatter(FuncFormatter(concise_fmt))
    axes[1].tick_params(axis='both', which='major', labelsize=16)
    axes[1].set_title('(b)', size=18, pad=-3 * 72, y=1.0001)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/SI_ratemsg_dist.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
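# concise_fmt is the tick formatter these scripts attach via FuncFormatter to shorten large
# counts. A plausible minimal sketch (the repo's version may choose a different format):
def concise_fmt(x, _):
    """Sketch: format a tick value compactly, e.g. 1500 -> '1.5K', 2000000 -> '2M'."""
    if x >= 1e6:
        value, suffix = x / 1e6, 'M'
    elif x >= 1e3:
        value, suffix = x / 1e3, 'K'
    else:
        return '{0:.0f}'.format(x)
    return ('{0:.0f}{1}' if value == int(value) else '{0:.1f}{1}').format(value, suffix)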
# Entity frequency distributions (panel a: user posting; panel b: hashtag), comparing the complete
# set, the corrected sample set, and a simulated uniform random sample at rate rho.
# Output: ../images/entity_freq_dist.pdf
import os
import platform
from collections import Counter, defaultdict

import numpy as np
from scipy import stats
from powerlaw import Fit
import matplotlib.pyplot as plt

# Timer, ColorPalette, hide_spines, plot_ccdf and infer_missing_num are helpers defined elsewhere in this repo.


def main():
    timer = Timer()
    timer.start()

    app_name = 'cyberbullying'
    archive_dir = '../data/{0}_out'.format(app_name)
    entities = ['user', 'hashtag']
    rho = 0.5272

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))
    cc4 = ColorPalette.CC4
    blue = cc4[0]

    for ax_idx, entity in enumerate(entities):
        sample_datefile = open(os.path.join(archive_dir, '{0}_{1}_all.txt'.format(entity, app_name)),
                               'r', encoding='utf-8')
        complete_datefile = open(os.path.join(archive_dir, 'complete_{0}_{1}.txt'.format(entity, app_name)),
                                 'r', encoding='utf-8')

        sample_entity_freq_dict = defaultdict(int)
        complete_entity_freq_dict = defaultdict(int)
        uni_random_entity_freq_dict = defaultdict(int)

        if entity == 'user':
            for line in sample_datefile:
                sample_entity_freq_dict[line.rstrip().split(',')[1]] += 1
            for line in complete_datefile:
                complete_entity_freq_dict[line.rstrip().split(',')[1]] += 1
                # simulate a uniform random sample by tossing a coin for every complete tweet
                toss = np.random.random_sample()
                if toss <= rho:
                    uni_random_entity_freq_dict[line.rstrip().split(',')[1]] += 1
        else:
            for line in sample_datefile:
                for item in line.rstrip().split(',')[1:]:
                    sample_entity_freq_dict[item.lower()] += 1
            for line in complete_datefile:
                for item in line.rstrip().split(',')[1:]:
                    complete_entity_freq_dict[item.lower()] += 1
                toss = np.random.random_sample()
                if toss <= rho:
                    for item in line.rstrip().split(',')[1:]:
                        uni_random_entity_freq_dict[item.lower()] += 1

        sample_datefile.close()
        complete_datefile.close()

        # compute the powerlaw fit in the complete set
        complete_freq_list = list(complete_entity_freq_dict.values())
        complete_powerlaw_fit = Fit(complete_freq_list)
        complete_alpha = complete_powerlaw_fit.power_law.alpha
        complete_xmin = complete_powerlaw_fit.power_law.xmin
        print('{0} complete set alpha {1}, xmin {2}'.format(entity, complete_alpha, complete_xmin))
        plot_ccdf(complete_freq_list, ax=axes[ax_idx], color='k', ls='-', label='complete')

        # compute the powerlaw fit in the sample set,
        # after inferring the number of entities missing from the sample
        sample_freq_list = list(sample_entity_freq_dict.values())
        sample_freq_counter = Counter(sample_freq_list)
        # we observe the frequency of entities appearing less than 100 times
        num_interest = 100
        sample_freq_list_top100 = [0] * num_interest
        for freq in range(1, num_interest + 1):
            sample_freq_list_top100[freq - 1] = sample_freq_counter[freq]
        inferred_num_missing = infer_missing_num(sample_freq_list_top100, rho=rho, m=num_interest)
        corrected_sample_freq_list = sample_freq_list + [0] * inferred_num_missing
        sample_powerlaw_fit = Fit(corrected_sample_freq_list)
        sample_alpha = sample_powerlaw_fit.power_law.alpha
        sample_xmin = sample_powerlaw_fit.power_law.xmin
        print('{0} sample set alpha {1}, xmin {2}'.format(entity, sample_alpha, sample_xmin))
        plot_ccdf(corrected_sample_freq_list, ax=axes[ax_idx], color=blue, ls='-', label='sample')

        # compute the powerlaw fit in the uniform random sample
        uni_random_num_missing = len(complete_entity_freq_dict) - len(uni_random_entity_freq_dict)
        uni_random_freq_list = list(uni_random_entity_freq_dict.values())
        uni_random_freq_list = uni_random_freq_list + [0] * uni_random_num_missing
        uni_random_powerlaw_fit = Fit(uni_random_freq_list)
        uni_random_alpha = uni_random_powerlaw_fit.power_law.alpha
        uni_random_xmin = uni_random_powerlaw_fit.power_law.xmin
        print('{0} uniform random sampling alpha {1}, xmin {2}'.format(entity, uni_random_alpha, uni_random_xmin))
        plot_ccdf(uni_random_freq_list, ax=axes[ax_idx], color='k', ls='--', label='uniform random')

        print('inferred missing', inferred_num_missing)
        print('empirical missing', len(complete_entity_freq_dict) - len(sample_entity_freq_dict))
        print('uniform random missing', uni_random_num_missing)

        print('KS test (sample, uniform)')
        print(stats.ks_2samp(corrected_sample_freq_list, uni_random_freq_list))
        print('KS test (sample, complete)')
        print(stats.ks_2samp(corrected_sample_freq_list, complete_freq_list))
        print('KS test (uniform, complete)')
        print(stats.ks_2samp(uni_random_freq_list, complete_freq_list))

        axes[ax_idx].set_xscale('symlog')
        axes[ax_idx].set_yscale('log')
        axes[ax_idx].set_xlabel('frequency', fontsize=16)
        axes[ax_idx].tick_params(axis='both', which='major', labelsize=16)

    axes[0].set_xticks([0, 1, 100, 10000])
    axes[0].set_yticks([1, 0.01, 0.0001, 0.000001])
    axes[0].set_ylabel(r'$P(X \geq x)$', fontsize=16)
    axes[0].legend(frameon=False, fontsize=16, ncol=1, fancybox=False, shadow=True, loc='lower left')
    axes[0].set_title('(a) user posting', fontsize=18, pad=-3 * 72, y=1.0001)

    axes[1].set_xticks([0, 1, 100, 10000, 1000000])
    axes[1].set_yticks([1, 0.1, 0.001, 0.00001])
    axes[1].set_title('(b) hashtag', fontsize=18, pad=-3 * 72, y=1.0001)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/entity_freq_dist.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
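# plot_ccdf draws the empirical complementary CDF used throughout these figures. A minimal
# sketch, assuming it plots P(X >= x) over the sorted values and forwards the remaining style
# keyword arguments to ax.plot; the repo's implementation may handle ties and scales differently.
import numpy as np


def plot_ccdf(data, ax, **kwargs):
    """Sketch: plot the empirical CCDF P(X >= x) of data on the given axis."""
    x = np.sort(np.asarray(data))
    # P(X >= x_i) = (n - i) / n for the i-th smallest value (0-indexed)
    y = 1.0 - np.arange(len(x)) / len(x)
    ax.plot(x, y, **kwargs)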
# Hourly tweet volume of the YouTube crawl by language group (panel a: ja+ko; panel b: others).
# Output: ../images/tweet_lang_vol.pdf
import os
import platform
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Timer, ColorPalette, hide_spines, concise_fmt and mean_confidence_interval are helpers defined elsewhere in this repo.


def main():
    timer = Timer()
    timer.start()

    app_name = 'youtube'
    archive_dir = './{0}_out'.format(app_name)
    lang_list = ['ja+ko', 'others']
    cc4 = ColorPalette.CC4
    red = cc4[3]
    num_days = 14
    hours_in_day = 24

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))

    # tweet ids observed in the sampled stream
    sample_tid_set = set()
    sample_ts_datefile = os.path.join(archive_dir, 'ts_{0}_all.txt'.format(app_name))
    with open(sample_ts_datefile, 'r') as fin:
        for line in fin:
            split_line = line.rstrip().split(',')
            if len(split_line) == 2:
                ts, tid = split_line
                sample_tid_set.add(tid)

    for idx, lang in enumerate(lang_list):
        # subcrawler files grouped by language: 2, 3, 8, 9 for ja+ko, the rest for other languages
        if idx == 0:
            subcrawler_ts_datefiles = [os.path.join(archive_dir, 'ts_{0}_{1}.txt'.format(app_name, j))
                                       for j in [2, 3, 8, 9]]
        else:
            subcrawler_ts_datefiles = [os.path.join(archive_dir, 'ts_{0}_{1}.txt'.format(app_name, j))
                                       for j in [1, 4, 5, 6, 7, 10, 11, 12]]

        num_in = 0
        num_out = 0
        count_sample = np.zeros(shape=(hours_in_day, num_days))
        count_complete = np.zeros(shape=(hours_in_day, num_days))
        visited_tid_set = set()

        for ts_datefile in subcrawler_ts_datefiles:
            with open(ts_datefile, 'r') as fin:
                for line in fin:
                    split_line = line.rstrip().split(',')
                    if len(split_line) == 2:
                        ts, tid = split_line
                        if tid not in visited_tid_set:
                            dt_obj = datetime.utcfromtimestamp(int(ts[:-3]))
                            # day index within the 14-day window (the crawl starts on the 6th of the month)
                            day_idx = dt_obj.day - 6
                            hour = dt_obj.hour
                            count_complete[hour][day_idx] += 1
                            if tid in sample_tid_set:
                                num_in += 1
                                count_sample[hour][day_idx] += 1
                            else:
                                num_out += 1
                            visited_tid_set.add(tid)

        print('collected tweets: {0}, missing tweets: {1}, sample ratio for lang {2}: {3:.2f}%'
              .format(num_in, num_out, lang, num_in / (num_in + num_out) * 100))

        # hourly tweet volume, averaged over the 14 days, with 95% confidence intervals
        sample_volume_mean_list_hour = []
        sample_ub_volume_mean_list_hour = []
        sample_lb_volume_mean_list_hour = []
        complete_volume_mean_list_hour = []
        complete_ub_volume_mean_list_hour = []
        complete_lb_volume_mean_list_hour = []
        for j in range(hours_in_day):
            mean, lb, ub = mean_confidence_interval(count_sample[j, :], confidence=0.95)
            sample_volume_mean_list_hour.append(mean)
            sample_lb_volume_mean_list_hour.append(lb)
            sample_ub_volume_mean_list_hour.append(ub)

            mean, lb, ub = mean_confidence_interval(count_complete[j, :], confidence=0.95)
            complete_volume_mean_list_hour.append(mean)
            complete_lb_volume_mean_list_hour.append(lb)
            complete_ub_volume_mean_list_hour.append(ub)

        print('tweet volumes from JST-6pm to 12am: {0:.2f}%'.format(
            100 * sum(complete_volume_mean_list_hour[9:15]) / sum(complete_volume_mean_list_hour)))
        print('sampling rates from JST-6pm to 12am: {0:.2f}%'.format(
            100 * sum(sample_volume_mean_list_hour[9:15]) / sum(complete_volume_mean_list_hour[9:15])))

        hour_x_axis = range(hours_in_day)
        axes[idx].plot(hour_x_axis, complete_volume_mean_list_hour, c='k', lw=1.5, ls='-', zorder=20, label='complete')
        axes[idx].fill_between(hour_x_axis, complete_ub_volume_mean_list_hour, complete_volume_mean_list_hour,
                               facecolor='lightgray', lw=0, zorder=10)
        axes[idx].fill_between(hour_x_axis, complete_lb_volume_mean_list_hour, complete_volume_mean_list_hour,
                               facecolor='lightgray', lw=0, zorder=10)

        axes[idx].plot(hour_x_axis, sample_volume_mean_list_hour, c='k', lw=1.5, ls='--', zorder=20, label='sample')
        axes[idx].fill_between(hour_x_axis, sample_ub_volume_mean_list_hour, sample_volume_mean_list_hour,
                               facecolor=red, alpha=0.8, lw=0, zorder=10)
        axes[idx].fill_between(hour_x_axis, sample_lb_volume_mean_list_hour, sample_volume_mean_list_hour,
                               facecolor=red, alpha=0.8, lw=0, zorder=10)

        if idx == 0:
            # annotate the JST evening peak (18:00-24:00 JST is 09:00-15:00 UTC)
            axes[idx].plot([9, 9], [complete_ub_volume_mean_list_hour[9], 150000], 'k--', lw=1)
            axes[idx].plot([15, 15], [complete_ub_volume_mean_list_hour[15], 150000], 'k--', lw=1)
            axes[idx].text(8, 150000, 'JST-6pm', ha='center', va='bottom', size=16)
            axes[idx].text(16, 150000, '12am', ha='center', va='bottom', size=16)
            axes[idx].annotate('', xy=(9, 135000), xycoords='data', xytext=(15, 135000), textcoords='data',
                               arrowprops=dict(arrowstyle='<->', connectionstyle='arc3'), zorder=50)

        axes[idx].set_xlabel('hour (in UTC)', fontsize=16)
        axes[idx].set_xticks([0, 6, 12, 18, 24])
        axes[idx].set_ylim([0, 180000])
        axes[idx].set_yticks([0, 50000, 100000, 150000])
        axes[idx].yaxis.set_major_formatter(FuncFormatter(concise_fmt))
        axes[idx].tick_params(axis='both', which='major', labelsize=16)
        axes[idx].set_title('({0}) {1}'.format(['a', 'b'][idx], lang_list[idx]), size=18, pad=-3 * 72, y=1.0001)

    axes[0].set_ylabel('#tweets', fontsize=16)
    axes[1].legend(frameon=False, fontsize=16, ncol=1, fancybox=False, shadow=True, loc='lower right')

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/tweet_lang_vol.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
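# mean_confidence_interval returns (mean, lower, upper) and is used for all shaded bands in
# these figures. A minimal sketch using a Student-t interval on the mean; the repo's
# implementation may use a different interval construction.
import numpy as np
from scipy import stats


def mean_confidence_interval(data, confidence=0.95):
    """Sketch: t-based confidence interval for the mean of data."""
    data = np.asarray(data, dtype=float)
    mean = float(np.mean(data))
    if len(data) < 2:
        return mean, mean, mean
    half_width = stats.sem(data) * stats.t.ppf((1 + confidence) / 2, len(data) - 1)
    return mean, mean - half_width, mean + half_width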
# Posting patterns of the most active users at four temporal scales, with two suspicious example
# accounts (panels c and d). Output: ../images/suspicious_users.pdf
import platform
from collections import defaultdict
from datetime import datetime

import numpy as np
from scipy.stats import entropy
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Timer, ColorPalette, hide_spines, melt_snowflake, write_to_file and read_from_file are helpers
# defined elsewhere in this repo.


def main():
    timer = Timer()
    timer.start()

    app_name = 'cyberbullying'

    hours_in_day = 24
    minutes_in_hour = 60
    seconds_in_minute = 60
    ms_in_second = 1000
    num_bins = 100
    width = ms_in_second // num_bins
    num_top = 500

    confusion_sampling_rate = np.load('../data/{0}_out/{0}_confusion_sampling_rate.npy'.format(app_name))
    confusion_sampling_rate = np.nan_to_num(confusion_sampling_rate)

    load_external_data = False
    if not load_external_data:
        sample_entity_stats = defaultdict(int)
        with open('../data/{0}_out/user_{0}_all.txt'.format(app_name), 'r') as fin:
            for line in fin:
                split_line = line.rstrip().split(',')
                sample_entity_stats[split_line[1]] += 1

        # == == == == == == Part 1: Rank users by volume in the sample set == == == == == == #
        print('>>> found top {0} users in sample set...'.format(num_top))
        sample_top = [kv[0] for kv in sorted(sample_entity_stats.items(), key=lambda x: x[1], reverse=True)[:num_top]]

        # == == == == == == Part 2: Find tweets appearing in complete set == == == == == == #
        complete_post_lists_hour = [[0] * hours_in_day for _ in range(num_top)]
        complete_post_lists_min = [[0] * minutes_in_hour for _ in range(num_top)]
        complete_post_lists_sec = [[0] * seconds_in_minute for _ in range(num_top)]
        complete_post_lists_10ms = [[0] * num_bins for _ in range(num_top)]

        complete_entity_stats = defaultdict(int)
        with open('../data/{0}_out/complete_user_{0}.txt'.format(app_name), 'r') as fin:
            for line in fin:
                split_line = line.rstrip().split(',')
                user_id = split_line[1]
                if user_id in sample_top:
                    complete_entity_stats[user_id] += 1
                    user_idx = sample_top.index(user_id)
                    tweet_id = split_line[0]
                    timestamp_ms = melt_snowflake(tweet_id)[0]
                    dt_obj = datetime.utcfromtimestamp(timestamp_ms // 1000)
                    hour = dt_obj.hour
                    minute = dt_obj.minute
                    second = dt_obj.second
                    millisec = timestamp_ms % 1000
                    # shift by 7 ms before binning milliseconds into 10 ms buckets
                    ms_idx = (millisec - 7) // width if millisec >= 7 else (1000 + millisec - 7) // width

                    complete_post_lists_hour[user_idx][hour] += 1
                    complete_post_lists_min[user_idx][minute] += 1
                    complete_post_lists_sec[user_idx][second] += 1
                    complete_post_lists_10ms[user_idx][ms_idx] += 1

        write_to_file('./complete_post_lists_hour.txt', sample_top, complete_post_lists_hour)
        write_to_file('./complete_post_lists_min.txt', sample_top, complete_post_lists_min)
        write_to_file('./complete_post_lists_sec.txt', sample_top, complete_post_lists_sec)
        write_to_file('./complete_post_lists_10ms.txt', sample_top, complete_post_lists_10ms)

        print('>>> finish dumping complete lists...')
        timer.stop()

        # == == == == == == Part 3: Find appearing tweets in sample set == == == == == == #
        sample_post_lists_hour = [[0] * hours_in_day for _ in range(num_top)]
        sample_post_lists_min = [[0] * minutes_in_hour for _ in range(num_top)]
        sample_post_lists_sec = [[0] * seconds_in_minute for _ in range(num_top)]
        sample_post_lists_10ms = [[0] * num_bins for _ in range(num_top)]

        estimated_post_lists_hour = [[0] * hours_in_day for _ in range(num_top)]
        estimated_post_lists_min = [[0] * minutes_in_hour for _ in range(num_top)]
        estimated_post_lists_sec = [[0] * seconds_in_minute for _ in range(num_top)]
        estimated_post_lists_10ms = [[0] * num_bins for _ in range(num_top)]

        # marginal sampling rates at hour, minute and second resolution
        hourly_conversion = np.mean(confusion_sampling_rate, axis=(1, 2, 3))
        minutey_conversion = np.mean(confusion_sampling_rate, axis=(2, 3))
        secondly_conversion = np.mean(confusion_sampling_rate, axis=(3))

        with open('../data/{0}_out/user_{0}_all.txt'.format(app_name), 'r') as fin:
            for line in fin:
                split_line = line.rstrip().split(',')
                user_id = split_line[1]
                if user_id in sample_top:
                    user_idx = sample_top.index(user_id)
                    tweet_id = split_line[0]
                    timestamp_ms = melt_snowflake(tweet_id)[0]
                    dt_obj = datetime.utcfromtimestamp(timestamp_ms // 1000)
                    hour = dt_obj.hour
                    minute = dt_obj.minute
                    second = dt_obj.second
                    millisec = timestamp_ms % 1000
                    ms_idx = (millisec - 7) // width if millisec >= 7 else (1000 + millisec - 7) // width

                    sample_post_lists_hour[user_idx][hour] += 1
                    sample_post_lists_min[user_idx][minute] += 1
                    sample_post_lists_sec[user_idx][second] += 1
                    sample_post_lists_10ms[user_idx][ms_idx] += 1

                    # inverse-propensity estimate: each sampled tweet counts for 1/sampling_rate tweets
                    estimated_post_lists_hour[user_idx][hour] += 1 / hourly_conversion[hour]
                    estimated_post_lists_min[user_idx][minute] += 1 / minutey_conversion[hour, minute]
                    estimated_post_lists_sec[user_idx][second] += 1 / secondly_conversion[hour, minute, second]
                    estimated_post_lists_10ms[user_idx][ms_idx] += 1 / confusion_sampling_rate[hour, minute, second, ms_idx]

        write_to_file('./sample_post_lists_hour.txt', sample_top, sample_post_lists_hour)
        write_to_file('./sample_post_lists_min.txt', sample_top, sample_post_lists_min)
        write_to_file('./sample_post_lists_sec.txt', sample_top, sample_post_lists_sec)
        write_to_file('./sample_post_lists_10ms.txt', sample_top, sample_post_lists_10ms)

        write_to_file('./estimated_post_lists_hour.txt', sample_top, estimated_post_lists_hour)
        write_to_file('./estimated_post_lists_min.txt', sample_top, estimated_post_lists_min)
        write_to_file('./estimated_post_lists_sec.txt', sample_top, estimated_post_lists_sec)
        write_to_file('./estimated_post_lists_10ms.txt', sample_top, estimated_post_lists_10ms)

        print('>>> finish dumping sample and estimated lists...')
        timer.stop()
    else:
        sample_top = []
        complete_post_lists_hour = []
        with open('./complete_post_lists_hour.txt', 'r') as fin:
            for line in fin:
                user_id, total, records = line.rstrip().split('\t')
                sample_top.append(user_id)
                records = list(map(int, records.split(',')))
                complete_post_lists_hour.append(records)

        complete_post_lists_min = read_from_file('./complete_post_lists_min.txt', dtype=0)
        complete_post_lists_sec = read_from_file('./complete_post_lists_sec.txt', dtype=0)
        complete_post_lists_10ms = read_from_file('./complete_post_lists_10ms.txt', dtype=0)

        sample_post_lists_hour = read_from_file('./sample_post_lists_hour.txt', dtype=0)
        sample_post_lists_min = read_from_file('./sample_post_lists_min.txt', dtype=0)
        sample_post_lists_sec = read_from_file('./sample_post_lists_sec.txt', dtype=0)
        sample_post_lists_10ms = read_from_file('./sample_post_lists_10ms.txt', dtype=0)

        estimated_post_lists_hour = read_from_file('./estimated_post_lists_hour.txt', dtype=1)
        estimated_post_lists_min = read_from_file('./estimated_post_lists_min.txt', dtype=1)
        estimated_post_lists_sec = read_from_file('./estimated_post_lists_sec.txt', dtype=1)
        estimated_post_lists_10ms = read_from_file('./estimated_post_lists_10ms.txt', dtype=1)

    # == == == == == == Part 4: Find the best estimation by comparing JS distance == == == == == == #
    ret = {}
    num_estimate_list = []
    num_sample_list = []
    num_complete_list = []

    sample_entity_stats = {user_id: sum(sample_post_lists_hour[user_idx])
                           for user_idx, user_id in enumerate(sample_top)}
    complete_entity_stats = {user_id: sum(complete_post_lists_hour[user_idx])
                             for user_idx, user_id in enumerate(sample_top)}

    min_mat = np.array([], dtype=np.int64).reshape(0, 60)
    sec_mat = np.array([], dtype=np.int64).reshape(0, 60)

    for user_idx, user_id in enumerate(sample_top):
        num_sample = sample_entity_stats[user_id]
        num_complete = complete_entity_stats[user_id]

        hour_entropy = entropy(sample_post_lists_hour[user_idx], base=hours_in_day)
        min_entropy = entropy(sample_post_lists_min[user_idx], base=minutes_in_hour)
        sec_entropy = entropy(sample_post_lists_sec[user_idx], base=seconds_in_minute)
        ms10_entropy = entropy(sample_post_lists_10ms[user_idx], base=num_bins)

        min_mat = np.vstack((min_mat, np.array(sample_post_lists_min[user_idx]).reshape(1, -1)))
        sec_mat = np.vstack((sec_mat, np.array(sample_post_lists_sec[user_idx]).reshape(1, -1)))

        # users whose millisecond-level posting pattern has low entropy are estimated at the
        # millisecond scale (index 3); all other users are estimated at the hour scale (index 0)
        if ms10_entropy < 0.87:
            min_entropy_idx = 3
        else:
            min_entropy_idx = 0

        num_estimate = sum([estimated_post_lists_hour[user_idx], estimated_post_lists_min[user_idx],
                            estimated_post_lists_sec[user_idx], estimated_post_lists_10ms[user_idx]][min_entropy_idx])
        num_estimate_list.append(num_estimate)
        num_sample_list.append(num_sample)
        num_complete_list.append(num_complete)

        ret[user_id] = (num_sample, num_complete, num_estimate, min_entropy_idx)

    # == == == == == == Part 5: Plot case users == == == == == == #
    case_user_ids = ['1033778124968865793', '1182605743335211009']
    case_user_screennames = ['WeltRadio', 'bensonbersk']

    fig, axes = plt.subplots(1, 2, figsize=(7.2, 2.3))
    cc4 = ColorPalette.CC4
    blue = cc4[0]
    red = cc4[3]
    filled_colors = [blue, red]
    labels = ['(c)', '(d)']

    for ax_idx, user_id in enumerate(case_user_ids):
        user_idx = sample_top.index(user_id)
        min_entropy_idx = ret[user_id][-1]

        if min_entropy_idx == 0:
            axes[ax_idx].bar(range(hours_in_day), complete_post_lists_hour[user_idx], color='lightgray', width=1)
            axes[ax_idx].bar(range(hours_in_day), sample_post_lists_hour[user_idx],
                             color=filled_colors[ax_idx], alpha=0.8, width=1)
            axes[ax_idx].plot(range(hours_in_day), estimated_post_lists_hour[user_idx], 'k-', lw=1.5)
            axes[ax_idx].set_xlabel('hour', fontsize=12)
            axes[ax_idx].set_xlim([-1, hours_in_day + 1])
            axes[ax_idx].set_xticks([0, 6, 12, 18, 24])
            axes[ax_idx].set_title('{0} {1}'.format(labels[ax_idx], case_user_screennames[ax_idx]), fontsize=13)
        elif min_entropy_idx == 1:
            axes[ax_idx].bar(range(minutes_in_hour), complete_post_lists_min[user_idx], color='lightgray', width=1)
            axes[ax_idx].bar(range(minutes_in_hour), sample_post_lists_min[user_idx],
                             color=filled_colors[ax_idx], alpha=0.8, width=1)
            axes[ax_idx].plot(range(minutes_in_hour), estimated_post_lists_min[user_idx], 'k-', lw=1.5)
            axes[ax_idx].set_xlabel('minute', fontsize=12)
            axes[ax_idx].set_xlim([-1, minutes_in_hour + 1])
            axes[ax_idx].set_xticks([0, 15, 30, 45, 60])
        elif min_entropy_idx == 2:
            axes[ax_idx].bar(range(seconds_in_minute), complete_post_lists_sec[user_idx], color='lightgray', width=1)
            axes[ax_idx].bar(range(seconds_in_minute), sample_post_lists_sec[user_idx],
                             color=filled_colors[ax_idx], alpha=0.8, width=1)
            axes[ax_idx].plot(range(seconds_in_minute), estimated_post_lists_sec[user_idx], 'k-', lw=1.5)
            axes[ax_idx].set_xlabel('second', fontsize=12)
            axes[ax_idx].set_xlim([-1, seconds_in_minute + 1])
            axes[ax_idx].set_xticks([0, 15, 30, 45, 60])
        elif min_entropy_idx == 3:
            axes[ax_idx].bar(range(num_bins), complete_post_lists_10ms[user_idx], color='lightgray', width=1)
            axes[ax_idx].bar(range(num_bins), sample_post_lists_10ms[user_idx],
                             color=filled_colors[ax_idx], alpha=0.8, width=1)
            axes[ax_idx].plot(range(num_bins), estimated_post_lists_10ms[user_idx], 'k-', lw=1.5)
            axes[ax_idx].set_xlabel('millisecond', fontsize=12)
            axes[ax_idx].set_xlim([-3, num_bins + 3])
            axes[ax_idx].set_xticks([0, 25, 50, 75, 100])
            # each bin covers 10 ms, so relabel the ticks accordingly
            axes[ax_idx].xaxis.set_major_formatter(FuncFormatter(lambda x, _: '{0:.0f}'.format(10 * x)))

        axes[ax_idx].tick_params(axis='both', which='major', labelsize=11)
        axes[ax_idx].set_title('{0} {1}'.format(labels[ax_idx], case_user_screennames[ax_idx]), fontsize=13)

    axes[0].set_ylabel('volume', fontsize=12)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout()
    plt.savefig('../images/suspicious_users.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
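# write_to_file / read_from_file persist the per-user count vectors between runs. Minimal
# sketches below, matching the tab-separated 'user_id<TAB>total<TAB>comma-joined counts' format
# that the load_external_data branch above parses; dtype=0 parses integer counts, dtype=1 floats.
def write_to_file(filepath, user_ids, count_lists):
    """Sketch: dump one 'user_id<TAB>total<TAB>c0,c1,...' line per user."""
    with open(filepath, 'w') as fout:
        for user_id, counts in zip(user_ids, count_lists):
            fout.write('{0}\t{1}\t{2}\n'.format(user_id, sum(counts), ','.join(map(str, counts))))


def read_from_file(filepath, dtype=0):
    """Sketch: read the count vectors back; returns a list of lists, one per user."""
    cast = int if dtype == 0 else float
    result = []
    with open(filepath, 'r') as fin:
        for line in fin:
            _, _, records = line.rstrip().split('\t')
            result.append([cast(x) for x in records.split(',')])
    return result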
# Temporal sampling rates of the two crawls (panel a: by hour; panel b: by millisecond), plus the
# per-(hour, minute, second, 10 ms bin) sampling rate tensor reused by the other scripts.
# Output: ../images/temporal_sampling_rates.pdf
import os
import platform
from datetime import datetime, timezone

import numpy as np
import matplotlib.pyplot as plt

# Timer, ColorPalette, hide_spines and mean_confidence_interval are helpers defined elsewhere in this repo.


def main():
    timer = Timer()
    timer.start()

    cc4 = ColorPalette.CC4
    blue = cc4[0]
    red = cc4[3]

    fig, axes = plt.subplots(1, 2, figsize=(10, 3.3))

    num_days = 14
    hours_in_day = 24
    hour_x_axis = range(hours_in_day)
    minutes_in_hour = 60
    seconds_in_minute = 60
    ms_in_second = 1000
    ms_bins = 100
    width = ms_in_second // ms_bins
    ms_x_axis = range(ms_in_second)

    app_conf = {'cyberbullying': {'min_date': '2019-10-13', 'label': 'Cyberbullying', 'color': blue},
                'youtube': {'min_date': '2019-11-06', 'label': 'YouTube', 'color': red}}

    for app_name in app_conf.keys():
        archive_dir = './{0}_out'.format(app_name)
        min_date = datetime.strptime(app_conf[app_name]['min_date'], '%Y-%m-%d').replace(tzinfo=timezone.utc)
        min_timestamp = int(min_date.timestamp())
        min_day = min_date.day

        sample_datefile = open(os.path.join(archive_dir, 'ts_{0}_all.txt'.format(app_name)), 'r')
        complete_datefile = open(os.path.join(archive_dir, 'complete_ts_{0}.txt'.format(app_name)), 'r')

        sample_tid_set = set()
        hour_hit_mat = np.zeros(shape=(hours_in_day, num_days))
        hour_miss_mat = np.zeros(shape=(hours_in_day, num_days))
        ms_hit_mat = np.zeros(shape=(ms_in_second, num_days * hours_in_day))
        ms_miss_mat = np.zeros(shape=(ms_in_second, num_days * hours_in_day))
        confusion_hit_mat = np.zeros(shape=(hours_in_day, minutes_in_hour, seconds_in_minute, ms_bins))
        confusion_miss_mat = np.zeros(shape=(hours_in_day, minutes_in_hour, seconds_in_minute, ms_bins))

        for line in sample_datefile:
            split_line = line.rstrip().split(',')
            if len(split_line) == 2:
                sample_tid_set.add(split_line[1])

        for line in complete_datefile:
            split_line = line.rstrip().split(',')
            if len(split_line) == 2:
                # the first field is a millisecond timestamp string; drop the last 3 digits for seconds
                timestamp_sec = int(split_line[0][:-3])
                if timestamp_sec >= min_timestamp:
                    dt_obj = datetime.utcfromtimestamp(timestamp_sec)
                    day_idx = dt_obj.day - min_day
                    hour = dt_obj.hour
                    minute = dt_obj.minute
                    second = dt_obj.second
                    millisec = int(split_line[0][-3:])
                    # shift by 7 ms before binning milliseconds into 10 ms buckets
                    ms_idx = (millisec - 7) // width if millisec >= 7 else (ms_in_second + millisec - 7) // width
                    if split_line[1] in sample_tid_set:
                        hour_hit_mat[hour][day_idx] += 1
                        ms_hit_mat[millisec][hours_in_day * day_idx + hour] += 1
                        confusion_hit_mat[hour][minute][second][ms_idx] += 1
                    else:
                        hour_miss_mat[hour][day_idx] += 1
                        ms_miss_mat[millisec][hours_in_day * day_idx + hour] += 1
                        confusion_miss_mat[hour][minute][second][ms_idx] += 1

        sample_datefile.close()
        complete_datefile.close()

        # hourly tweet sampling rate
        rho_mean_list_hour = []
        ub_rho_mean_list_hour = []
        lb_rho_mean_list_hour = []
        for i in hour_x_axis:
            mean, lb, ub = mean_confidence_interval(hour_hit_mat[i, :] / (hour_hit_mat[i, :] + hour_miss_mat[i, :]),
                                                    confidence=0.95)
            rho_mean_list_hour.append(mean)
            lb_rho_mean_list_hour.append(lb)
            ub_rho_mean_list_hour.append(ub)

        # sampling rate at (hour, minute, second, 10 ms bin) resolution, saved for reuse by other scripts
        confusion_sampling_rate = confusion_hit_mat / (confusion_hit_mat + confusion_miss_mat)
        confusion_sampling_rate = np.nan_to_num(confusion_sampling_rate)
        np.save(os.path.join(archive_dir, '{0}_confusion_sampling_rate.npy'.format(app_name)), confusion_sampling_rate)

        axes[0].plot(hour_x_axis, rho_mean_list_hour, c='k', lw=1.5, ls='-', zorder=20)
        axes[0].fill_between(hour_x_axis, ub_rho_mean_list_hour, rho_mean_list_hour,
                             facecolor=app_conf[app_name]['color'], alpha=0.8, lw=0, zorder=10)
        axes[0].fill_between(hour_x_axis, lb_rho_mean_list_hour, rho_mean_list_hour,
                             facecolor=app_conf[app_name]['color'], alpha=0.8, lw=0, zorder=10,
                             label='{0}'.format(app_conf[app_name]['label']))

        # millisecond-level tweet sampling rate
        rho_mean_list_ms = []
        ub_rho_mean_list_ms = []
        lb_rho_mean_list_ms = []
        for i in ms_x_axis:
            mean, lb, ub = mean_confidence_interval(ms_hit_mat[i, :] / (ms_hit_mat[i, :] + ms_miss_mat[i, :]),
                                                    confidence=0.95)
            rho_mean_list_ms.append(mean)
            lb_rho_mean_list_ms.append(lb)
            ub_rho_mean_list_ms.append(ub)

        axes[1].plot(ms_x_axis, rho_mean_list_ms, c='k', lw=1.5, ls='-', zorder=20)
        axes[1].fill_between(ms_x_axis, ub_rho_mean_list_ms, rho_mean_list_ms,
                             facecolor=app_conf[app_name]['color'], alpha=0.8, lw=0, zorder=10)
        axes[1].fill_between(ms_x_axis, lb_rho_mean_list_ms, rho_mean_list_ms,
                             facecolor=app_conf[app_name]['color'], alpha=0.8, lw=0, zorder=10)

    axes[0].set_xticks([0, 6, 12, 18, 24])
    axes[0].set_xlabel('hour (in UTC)', fontsize=16)
    axes[0].set_ylim([-0.05, 1.05])
    axes[0].set_yticks([0, 0.25, 0.5, 0.75, 1.0])
    axes[0].set_ylabel(r'sampling rate $\rho_t$', fontsize=16)
    axes[0].tick_params(axis='both', which='major', labelsize=16)
    axes[0].legend(frameon=False, fontsize=16, ncol=1, fancybox=False, shadow=True)
    axes[0].set_title('(a)', size=18, pad=-3 * 72, y=1.0001)

    axes[1].axvline(x=657, ymin=0, ymax=0.4, c='k', ls='--')
    axes[1].text(667, 0.2, 'x=657', size=18, ha='left', va='center')
    axes[1].set_xticks([0, 250, 500, 750, 1000])
    axes[1].set_xlabel('millisecond', fontsize=16)
    axes[1].set_ylim([-0.05, 1.05])
    axes[1].set_yticks([0, 0.25, 0.5, 0.75, 1.0])
    axes[1].tick_params(axis='both', which='major', labelsize=16)
    axes[1].set_title('(b)', size=18, pad=-3 * 72, y=1.0001)

    hide_spines(axes)

    timer.stop()

    plt.tight_layout(rect=[0, 0.05, 1, 1])
    plt.savefig('../images/temporal_sampling_rates.pdf', bbox_inches='tight')
    if not platform.system() == 'Linux':
        plt.show()
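# Timer and hide_spines are small utilities shared by every script in this section. Minimal
# sketches under assumptions (the repo's versions may record more detail or style more spines):
import time

import numpy as np


class Timer:
    """Sketch: wall-clock timer with the start()/stop() interface used above."""
    def start(self):
        self._t0 = time.time()

    def stop(self):
        print('elapsed time: {0:.2f} seconds'.format(time.time() - self._t0))


def hide_spines(axes):
    """Sketch: hide the top and right spines on every subplot."""
    for ax in np.ravel(axes):
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)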