def top_level_comment_response_dist(code, cascades=False, comments=False, bin_minutes=1, remove_first=True):
    #load data if missing
    if cascades == False or comments == False:
        cascades, comments, missing_posts, missing_comments = build_cascades(code)

    print("\nComputing top-level comment response time distribution")

    #response time dictionary: time in minutes -> number of responses with that delay
    response_times = defaultdict(int)

    #for each post, look at all top-level replies
    for post_id, post in cascades.items():        #loop posts
        #if this post is a dummy object, alert the user and quit
        if post['placeholder']:
            print("Data contains placeholder post. Please use remove_missing to filter out incomplete cascades first.")
            exit(0)

        post_time = post['created_utc']        #grab post time to compute reply delay

        for comment_id in post['replies']:        #loop replies
            #get response time in minutes for this comment, binned by bin_minutes
            response_time = int((comments[comment_id]['created_utc'] - post_time) / (bin_minutes * 60.0)) * bin_minutes
            #if response time is somehow negative, print a warning but keep running
            if response_time < 0:
                print("Warning: negative response time!")
            #add one to counter for this response time bin
            response_times[response_time] += 1

    #throw out first minute (bots)
    if remove_first == True:
        response_times.pop(0, None)

    #convert frequencies to a probability distribution
    total = sum(response_times.values())
    for key in response_times.keys():
        response_times[key] /= total

    #save response time distribution, but only if bin_minutes = 1
    if bin_minutes == 1:
        print("Saving top-level comment response time distribution to results/%s_top_level_comment_response_time_dist_%s.json" % (code, bin_minutes))
        file_utils.verify_dir("results")
        file_utils.save_json(response_times, "results/%s_top_level_comment_response_time_dist_%s.json" % (code, bin_minutes))

    #plot everything: full log-log, zoomed log-log, and zoomed linear
    print("Plotting top-level comment response time distribution to plots/%s_top_level_comment_response_times_%s.png" % (code, bin_minutes))
    file_utils.verify_dir("plots")
    plot_utils.plot_dict_data(response_times, "reply delay time (minutes)", "fraction of replies",
        "Top-Level Comment Response Time Distribution - %s Minute Bins" % bin_minutes,
        filename="plots/%s_top_level_comment_response_times_%s_log.png" % (code, bin_minutes),
        x_min=0, log_scale_x=True, log_scale_y=True)
    plot_utils.plot_dict_data(response_times, "reply delay time (minutes)", "fraction of replies",
        "Top-Level Comment Response Time Distribution - %s Minute Bins" % bin_minutes,
        filename="plots/%s_top_level_comment_response_times_%s_zoom_log.png" % (code, bin_minutes),
        x_min=0, x_max=60*24, log_scale_x=True, log_scale_y=True)
    plot_utils.plot_dict_data(response_times, "reply delay time (minutes)", "fraction of replies",
        "Top-Level Comment Response Time Distribution - %s Minute Bins" % bin_minutes,
        filename="plots/%s_top_level_comment_response_times_%s_zoom.png" % (code, bin_minutes),
        x_min=0, x_max=60*24)
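#Example usage (a minimal sketch; "cyber" is only an illustrative domain code, and the
#cascade data for that code is assumed to already be available to build_cascades):
#    cascades, comments, missing_posts, missing_comments = build_cascades("cyber")
#    top_level_comment_response_dist("cyber", cascades=cascades, comments=comments, bin_minutes=1)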
def save_cascades(code, cascades, filtered=False):
    if filtered == False:
        file_utils.verify_dir("data_cache/%s_cascades" % code)
        print("Saving cascades to data_cache/%s_cascades/%s_cascade_posts.pkl" % (code, code))
        file_utils.save_pickle(cascades, "data_cache/%s_cascades/%s_cascade_posts.pkl" % (code, code))
    else:
        file_utils.verify_dir("data_cache/filtered_cascades")
        print("Saving filtered cascades to data_cache/filtered_cascades/%s_%s_cascades.pkl" % (code, filtered))
        file_utils.save_pickle(cascades, "data_cache/filtered_cascades/%s_%s_cascades.pkl" % (code, filtered))
def save_cascade_params(code, cascade_params, filtered=False):
    if filtered == False:
        file_utils.verify_dir("data_cache/fitted_params")
        print("Saving cascade params to data_cache/fitted_params/%s_cascade_params.pkl" % code)
        file_utils.save_pickle(cascade_params, "data_cache/fitted_params/%s_cascade_params.pkl" % code)
    else:
        file_utils.verify_dir("data_cache/fitted_params")
        print("Saving filtered cascade params to data_cache/fitted_params/%s_%s_cascade_params.pkl" % (code, filtered))
        file_utils.save_pickle(cascade_params, "data_cache/fitted_params/%s_%s_cascade_params.pkl" % (code, filtered))
def get_subreddits(code, cascades=False, display=False):
    #no cascades, load them first
    if cascades == False:
        cascades, comments, missing_posts, missing_comments = build_cascades(code)

    #get distribution
    subreddit_dist = data_utils.dictionary_field_dist(cascades, 'subreddit')

    #print distribution if desired
    if display:
        for key, value in subreddit_dist.items():
            print(key, value)

    #save distribution to json file
    print("Saving subreddit distribution to results/%s_post_subreddit_dist.json" % code)
    file_utils.verify_dir("results")
    file_utils.save_json(subreddit_dist, "results/%s_post_subreddit_dist.json" % code)

    return subreddit_dist
def check_comment_count(code, cascades=False):
    #load data if missing
    if cascades == False:
        print("loading data - build cascades")
        cascades, comments, missing_posts, missing_comments = build_cascades(code)

    #data dictionary: key is num_comments field, value is number of direct replies we found
    direct_count_dict = defaultdict(list)
    #and one for the total number of comments, since that might be more what they're giving us
    total_count_dict = defaultdict(list)

    #process each cascade
    for post_id, post in cascades.items():
        direct_count_dict[post['num_comments']].append(post['comment_count_direct'])
        total_count_dict[post['num_comments']].append(post['comment_count_total'])

    #convert lists to average
    for key in direct_count_dict.keys():
        direct_count_dict[key] = sum(direct_count_dict[key]) / len(direct_count_dict[key])
        total_count_dict[key] = sum(total_count_dict[key]) / len(total_count_dict[key])

    #plot results (no save for now)
    file_utils.verify_dir("plots")
    plot_utils.plot_mult_dict_data([direct_count_dict, total_count_dict],
        ['cascade direct replies', 'cascade total comments'],
        'num_comments from data', 'comments from cascade',
        'Number of Comments: Given vs Counted',
        filename="plots/%s_comment_counts.png" % code)
def save_comments(code, comments, filtered=False):
    if filtered == False:
        print("Saving comments to data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code))
        #save all comments to pickle
        file_utils.verify_dir("data_cache/%s_cascades" % code)

        #break cyber comments into separate files, because memory error
        if code == "cyber":
            temp = {}        #temporary dictionary to hold a chunk of comments
            count = 0
            for comment_id, comment in comments.items():
                temp[comment_id] = comment
                count += 1
                #dump each full chunk of one million comments to its own file
                if count % 1000000 == 0:
                    file_utils.save_pickle(temp, "data_cache/%s_cascades/%s_cascade_comments_%s.pkl" % (code, code, count // 1000000))
                    temp = {}
            #last save: dump any remaining partial chunk to the next file index
            if len(temp) != 0:
                file_utils.save_pickle(temp, "data_cache/%s_cascades/%s_cascade_comments_%s.pkl" % (code, code, count // 1000000 + 1))
        else:
            file_utils.save_pickle(comments, "data_cache/%s_cascades/%s_cascade_comments.pkl" % (code, code))
    else:
        file_utils.verify_dir("data_cache/filtered_cascades")
        print("Saving filtered comments to data_cache/filtered_cascades/%s_%s_comments.pkl" % (code, filtered))
        file_utils.save_pickle(comments, "data_cache/filtered_cascades/%s_%s_comments.pkl" % (code, filtered))
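#Example usage of the save helpers above (a minimal sketch; "cyber" is only an illustrative
#domain code and "some_subreddit" is a hypothetical filter tag used in the cached filenames):
#    cascades, comments, missing_posts, missing_comments = build_cascades("cyber")
#    save_cascades("cyber", cascades)                              #unfiltered posts
#    save_comments("cyber", comments)                              #unfiltered comments
#    save_cascades("cyber", cascades, filtered="some_subreddit")   #filtered variant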
arguments_list.append('-v')        #verbose output
arguments_list.append('-d')        #include default posts in graph
#arguments_list.append('--sanity')        #simulate from fitted params
#arguments_list.append('--train_stats')   #compute and output training set stats
#arguments_list.append('--test_stats')    #compute and output testing set stats
#arguments_list.append('-b')              #force all training node qualities to 1, so learning rate is 0
#arguments_list.append('-timestamps')     #can only use this option if doing jaccard
#arguments_list.append('-stopwords')      #remove stopwords from titles for jaccard edge weight calc
#END ARGUMENTS

#make sure output dir for each subreddit exists
for subreddit in subreddits:
    file_utils.verify_dir("sim_results/%s" % subreddit)
    file_utils.verify_dir("sim_results/%s/run_results" % subreddit)

#prepend/append a None to size breaks list for easier looping
size_breaks = [None] + size_breaks + [None]

#count runs for each subreddit (hackery incoming)
sub_counts = defaultdict(int)

#keep list of background processes, so we can wait for them at the end
background_procs = []

#outfile list
#for each model, keep a list of base outfiles, so we can check the bookmarks at the end
outfile_lists = {
    "model": [],
# paper_model.py pivx ZeuF7ZTDw3McZUOaosvXdA 5 sim_tree 250 -1        (11 comments)
# paper_model.py compsci qOjspbLmJbLMVFxYbjB1mQ 200 sim_tree 250 -1   (58 comments)

import file_utils
import functions_paper_model
import cascade_manip
import fit_partial_cascade

print("")

#parse all command-line arguments
group, input_sim_post_id, time_observed, outfile, max_nodes, min_node_quality, estimate_initial_params, batch = functions_paper_model.parse_command_args()

#ensure working directory exists
file_utils.verify_dir("sim_files")

#load posts and comments for this group
raw_posts, raw_comments = functions_paper_model.load_group_data(group)

#ensure post id is in dataset (gets list of all post ids if running all)
sim_post_id_list, random_post = functions_paper_model.verify_post_id(input_sim_post_id, batch, list(raw_posts.keys()))

#if running in mode all, keep total of all metrics, dump at end
if batch:
    total_dist = 0
    total_update_count = 0
    total_update_time = 0
    total_insert_count = 0
    total_insert_time = 0
#print some log-ish stuff in case output being piped and saved
print("Input", infile)
print("Output", outfile)
print("Domain", domain)
if min_node_quality != -1:
    print("Minimum node quality", min_node_quality)
if estimate_initial_params:
    print("Estimating initial params for seed posts based on inverse quality weighted average of neighbors")
if sub_filter != "":
    print("Processing only", sub_filter, "subreddit")
print("")

file_utils.verify_dir("sim_files")        #ensure working directory exists

#load post seeds
raw_post_seeds = load_reddit_seeds(infile)

#convert to dictionary of subreddit -> list of post objects
post_seeds = defaultdict(list)
for post in raw_post_seeds:
    #cve: group all together with subreddit set to cve
    if domain == "cve":
        post_seeds["cve"].append(post)
    else:
        post_seeds[post['subreddit']].append(post)
print({key: len(post_seeds[key]) for key in post_seeds})

all_events = []        #list for all sim events, across all seed posts
#hackery: declare a special print function for verbose output
if verbose:
    def vprint(*args):
        #print each argument separately so caller doesn't need to
        #stuff everything to be printed into a single string
        for arg in args:
            print(arg, end='')
        print("")
else:
    vprint = lambda *a: None        #do-nothing function
#and make sure the regular model functions have this too
functions_gen_cascade_model.define_vprint(verbose)

#ensure data directory for this subreddit exists - for saving posts, cascades, params, etc
file_utils.verify_dir("reddit_data/%s" % subreddit)

#if using socsim data, special load process (no time-defined sets)
if socsim_data:
    socsim_data_functions_gen_cascade_model.define_vprint(verbose)        #define vprint for that function class

    #load all the training and testing data for this domain
    train_posts, train_cascades, train_params, train_fit_fail_list, test_posts, test_cascades, test_params, test_fit_fail_list = socsim_data_functions_gen_cascade_model.load_data(subreddit)

#otherwise, standard data load (use month-year and lengths to define testing set)
else:
    #no training data, since just using observed comments for this cascade
    vprint("\nLoading processed testing data")

    #load pre-processed posts and their reconstructed cascades for testing period (no params here!)
    test_posts, test_cascades = functions_gen_cascade_model.load_processed_posts(subreddit, testing_start_month, testing_start_year, testing_num, load_cascades=True)
#so any overlap is redundant and can be thrown away
#(yes, there are neater ways to do this, but I don't care!)
for item in subreddit_dict.keys():
    if len(subreddit_dict[item]) > 1:
        #crypto and cyber drown out cve, so remove it
        if ("crypto" in subreddit_dict[item] or "cyber" in subreddit_dict[item]) and "cve" in subreddit_dict[item]:
            subreddit_dict[item].remove("cve")
    subreddit_dict[item] = subreddit_dict[item][0]

#save as pickle for later
print("Saving subreddit->domain mapping to", subreddits_filepath)
file_utils.save_pickle(subreddit_dict, subreddits_filepath)

#verify directories for output files
file_utils.verify_dir("model_files/params")
file_utils.verify_dir("model_files/posts")
file_utils.verify_dir("model_files/graphs")
file_utils.verify_dir("model_files/users")

#loop all subreddits
for subreddit, domain in subreddit_dict.items():
    if subreddit != subreddit_filter:
        continue
    '''
    if domain != "crypto":
        continue
    '''
    print("\nProcessing", subreddit, "in", domain, "domain")