Example #1
def top_level_comment_response_dist(code, cascades = False, comments = False, bin_minutes = 1, remove_first = True):
	#load data if missing
	if cascades == False or comments == False:
		cascades, comments, missing_posts, missing_comments = build_cascades(code)

	print("\nComputing top-level comment response time distribution")

	#response time dictionary: time in minutes -> number of responses with that delay
	response_times = defaultdict(int)

	#for each post, look at all top-level replies
	for post_id, post in cascades.items():		#loop posts
		#if this post is a dummy object, alert the user and quit
		if post['placeholder']:
			print("Data contains placeholder post. Please use remove_missing to filter out incomplete cascades first.")
			exit(0)

		post_time = post['created_utc']		#grab post time to compute reply delay

		for comment_id in post['replies']:		#loop replies
			#get response time in minutes for this comment
			response_time = int((comments[comment_id]['created_utc'] - post_time) / (bin_minutes * 60.0)) * bin_minutes

			#if response time is somehow negative, throw an error message but keep running
			if response_time < 0:
				print("Warning: negative response time!")
			#add one to counter for this response time (binned by minutes)
			response_times[response_time] += 1

	#throw out first minute (bots)
	if remove_first == True:
		response_times.pop(0, None)

	#convert frequencies to probability distribution function
	total = sum(response_times.values())
	for key in response_times.keys():
		response_times[key] /= total

	#save response time distribution, but only if bin_minutes = 1
	if bin_minutes == 1:
		print("Saving top-level comment response time distribution to results/%s_top_level_comment_response_time_dist_%s_<options>.json" % (code, bin_minutes))
		file_utils.verify_dir("results")
		file_utils.save_json(response_times, "results/%s_top_level_comment_response_time_dist_%s.json" % (code, bin_minutes))

	#plot everything
	print("Plotting top-level comment response time distribution to plots/%s_top_level_comment_response_times_%s.png" % (code, bin_minutes))
	file_utils.verify_dir("plots")
	plot_utils.plot_dict_data(response_times, "reply delay time (minutes)", "number of replies", "Top-Level Comment Response Time Distribution - %s Minute Bins" % bin_minutes, filename = "plots/%s_top_level_comment_response_times_%s_log.png" % (code, bin_minutes), x_min = 0, log_scale_x = True, log_scale_y = True)
	plot_utils.plot_dict_data(response_times, "reply delay time (minutes)", "number of replies", "Top-Level Comment Response Time Distribution - %s Minute Bins" % bin_minutes, filename = "plots/%s_top_level_comment_response_times_%s_zoom_log.png" % (code, bin_minutes), x_min = 0, x_max = 60*24, log_scale_x = True, log_scale_y = True)
	plot_utils.plot_dict_data(response_times, "reply delay time (minutes)", "number of replies", "Top-Level Comment Response Time Distribution - %s Minute Bins" % bin_minutes, filename = "plots/%s_top_level_comment_response_times_%s_zoom.png" % (code, bin_minutes), x_min = 0, x_max = 60*24)
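
A minimal usage sketch, not part of the original example: it assumes the repo's build_cascades helper shown in the other examples is importable and that "cve" is a valid dataset code; the 5-minute bin width is an arbitrary illustration.

#hedged sketch: reuse already-built cascades so the data is not loaded twice,
#then bin top-level reply delays into 5-minute buckets
cascades, comments, missing_posts, missing_comments = build_cascades("cve")
top_level_comment_response_dist("cve", cascades=cascades, comments=comments, bin_minutes=5)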
Example #2
def save_cascades(code, cascades, filtered=False):
    if filtered == False:
        file_utils.verify_dir("data_cache/%s_cascades" % code)
        print(
            "Saving cascades to data_cache/%s_cascades/%s_cascade_posts.pkl" %
            (code, code))
        file_utils.save_pickle(
            cascades,
            "data_cache/%s_cascades/%s_cascade_posts.pkl" % (code, code))
    else:
        file_utils.verify_dir("data_cache/filtered_cascades")
        print(
            "Saving filtered cascades to data_cache/filtered_cascades/%s_%s_cascades.pkl"
            % (code, filtered))
        file_utils.save_pickle(
            cascades, "data_cache/filtered_cascades/%s_%s_cascades.pkl" %
            (code, filtered))
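
A short, hypothetical call pattern for the function above, assuming `cascades` holds posts returned by build_cascades for the "crypto" code; the `filtered_cascades` variable and the "Bitcoin" filter label are illustrative only.

#save the full cascade set for a dataset code
save_cascades("crypto", cascades)
#save a filtered subset; the filtered argument doubles as the filename label
save_cascades("crypto", filtered_cascades, filtered="Bitcoin")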
Example #3
def save_cascade_params(code, cascade_params, filtered=False):
    if filtered == False:
        file_utils.verify_dir("data_cache/fitted_params/")
        print(
            "Saving cascade params to data_cache/fitted_params/%s_cascade_params.pkl"
            % code)
        file_utils.save_pickle(
            cascade_params,
            "data_cache/fitted_params/%s_cascade_params.pkl" % code)
    else:
        file_utils.verify_dir("data_cache/fitted_params")
        print(
            "Saving filtered cascades to data_cache/fitted_params/%s_%s_cascade_params.pkl"
            % (code, filtered))
        file_utils.save_pickle(
            cascade_params,
            "data_cache/fitted_params/%s_%s_cascade_params.pkl" %
            (code, filtered))
Example #4
def get_subreddits(code, cascades = False, display = False):
	#no cascades, load them first
	if cascades == False:
		cascades, comments, missing_posts, missing_comments = build_cascades(code)

	#get distribution
	subreddit_dist = data_utils.dictionary_field_dist(cascades, 'subreddit')

	#print distribution if desired
	if display:
		for key, value in subreddit_dist.items():
			print(key, value)

	#save distribution to json file
	print("Saving subreddit distribution to results/%s_post_subreddit_dist.json" % code)
	file_utils.verify_dir("results")
	file_utils.save_json(subreddit_dist, "results/%s_post_subreddit_dist.json" % code)

	return subreddit_dist
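
A brief usage sketch, assuming "crypto" is a valid dataset code; the call below is illustrative rather than taken from the repo.

#build the subreddit distribution, print it, and grab the most common subreddit
subreddit_dist = get_subreddits("crypto", display=True)
most_common = max(subreddit_dist, key=subreddit_dist.get)
print("most common subreddit:", most_common)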
Example #5
def check_comment_count(code, cascades = False):
	#load data if missing
	if cascades == False:
		print("loading data - build cascades")
		cascades, comments, missing_posts, missing_comments = build_cascades(code)

	#data dictionary: key is num_comments field, value is number of direct replies we found
	direct_count_dict = defaultdict(list)
	#and a second dictionary for the total comment count, since num_comments may reflect the whole tree rather than just direct replies
	total_count_dict = defaultdict(list)

	#process each cascade
	for post_id, post in cascades.items():
		direct_count_dict[post['num_comments']].append(post['comment_count_direct'])
		total_count_dict[post['num_comments']].append(post['comment_count_total'])

	#convert lists to average
	for key in direct_count_dict.keys():
		direct_count_dict[key] = sum(direct_count_dict[key]) / len(direct_count_dict[key])
		total_count_dict[key] = sum(total_count_dict[key]) / len(total_count_dict[key])

	#plot results (no save for now)
	file_utils.verify_dir("plots")
	plot_utils.plot_mult_dict_data([direct_count_dict, total_count_dict], ['cascade direct replies', 'cascade total comments'], 'num_comments from data', 'comments from cascade', 'Number of Comments: Given vs Counted', filename = "plots/%s_comment_counts.png" % code)
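
A hedged example of driving this check, assuming the "cyber" dataset code and the build_cascades loader used elsewhere in these examples.

#load cascades once and reuse them for the comment-count sanity check
cascades, comments, missing_posts, missing_comments = build_cascades("cyber")
check_comment_count("cyber", cascades=cascades)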
Example #6
def save_comments(code, comments, filtered=False):
    if filtered == False:
        print(
            "Saving comments to data_cache/%s_cascades/%s_cascade_comments.pkl"
            % (code, code))
        #save all comments to pickle
        file_utils.verify_dir("data_cache/%s_cascades" % code)
        #break cyber comments into separate files, because memory error
        if code == "cyber":
            temp = {}  #temporary dictionary to hold a chunk of comments
            count = 0
            for comment_id, comment in comments.items():
                temp[comment_id] = comment
                count += 1
                if count % 1000000 == 0:
                    file_utils.save_pickle(
                        temp,
                        "data_cache/%s_cascades/%s_cascade_comments_%s.pkl" %
                        (code, code, count // 1000000))
                    temp = {}
            #save any leftover comments in a final chunk; guard the empty case so
            #an exact multiple of 1000000 doesn't overwrite the last full chunk
            if temp:
                file_utils.save_pickle(
                    temp, "data_cache/%s_cascades/%s_cascade_comments_%s.pkl" %
                    (code, code, count // 1000000 + 1))
        else:
            file_utils.save_pickle(
                comments, "data_cache/%s_cascades/%s_cascade_comments.pkl" %
                (code, code))
    else:
        file_utils.verify_dir("data_cache/filtered_cascades")
        print(
            "Saving filtered comments to data_cache/filtered_cascades/%s_%s_comments.pkl"
            % (code, filtered))
        file_utils.save_pickle(
            comments, "data_cache/filtered_cascades/%s_%s_comments.pkl" %
            (code, filtered))
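
A sketch of the two save paths above; the `comments` and `filtered_comments` variables and the "netsec" filter label are hypothetical.

#full save: the "cyber" code is written in million-comment chunks, other codes in one pickle
save_comments("cyber", comments)
#filtered save: the filter label becomes part of the output filename
save_comments("cyber", filtered_comments, filtered="netsec")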
Example #7
arguments_list.append('-v')  #verbose output
arguments_list.append('-d')  #include default posts in graph
#arguments_list.append('--sanity')	#simulate from fitted params
#arguments_list.append('--train_stats')		#compute and output training set stats
#arguments_list.append('--test_stats')		#compute and output testing set stats
#arguments_list.append('-b')		#force all training node qualities to 1, so learning rate is 0
#arguments_list.append('-timestamps')

#can only use this option if doing jaccard
#arguments_list.append('-stopwords')			#remove stopwords from titles for jaccard edge weight calc

#END ARGUMENTS

#make sure output dir for each subreddit exists
for subreddit in subreddits:
    file_utils.verify_dir("sim_results/%s" % subreddit)
    file_utils.verify_dir("sim_results/%s/run_results" % subreddit)

#prepend/append a None to size breaks list for easier looping
size_breaks = [None] + size_breaks + [None]

#count runs for each subreddit (hackery incoming)
sub_counts = defaultdict(int)

#keep list of background processes, so we can wait for them at the end
background_procs = []

#outfile list
#for each model, keep a list of base outfiles, so we can check the bookmarks at the end
outfile_lists = {
    "model": [],
Example #8
#	paper_model.py pivx ZeuF7ZTDw3McZUOaosvXdA 5 sim_tree 250 -1					(11 comments)
#	paper_model.py compsci qOjspbLmJbLMVFxYbjB1mQ 200 sim_tree 250 -1				(58 comments)

import file_utils
import functions_paper_model
import cascade_manip
import fit_partial_cascade

print("")

#parse all command-line arguments
group, input_sim_post_id, time_observed, outfile, max_nodes, min_node_quality, estimate_initial_params, batch = functions_paper_model.parse_command_args(
)

#ensure working directory exists
file_utils.verify_dir("sim_files")

#load posts and comments for this group
raw_posts, raw_comments = functions_paper_model.load_group_data(group)

#ensure post id is in dataset (gets list of all post ids if running all)
sim_post_id_list, random_post = functions_paper_model.verify_post_id(
    input_sim_post_id, batch, list(raw_posts.keys()))

#if running in mode all, keep total of all metrics, dump at end
if batch:
    total_dist = 0
    total_update_count = 0
    total_update_time = 0
    total_insert_count = 0
    total_insert_time = 0
Example #9
#print some log-ish stuff in case output is being piped and saved
print("Input", infile)
print("Output", outfile)
print("Domain", domain)
if min_node_quality != -1:
    print("Minimum node quality", min_node_quality)
if estimate_initial_params:
    print(
        "Estimating initial params for seed posts based on inverse quality weighted average of neighbors"
    )
if sub_filter != "":
    print("Processing only", sub_filter, "subreddit")
print("")

file_utils.verify_dir("sim_files")  #ensure working directory exists

#load post seeds
raw_post_seeds = load_reddit_seeds(infile)

#convert to dictionary of subreddit->list of post objects
post_seeds = defaultdict(list)
for post in raw_post_seeds:
    #cve, group all together with subreddit set to cve
    if domain == "cve":
        post_seeds["cve"].append(post)
    else:
        post_seeds[post['subreddit']].append(post)
print({key: len(post_seeds[key]) for key in post_seeds})

all_events = []  #list for all sim events, across all seed posts
Example #10
#hackery: declare a special print function for verbose output
if verbose:
	def vprint(*args):
		# Print each argument separately so caller doesn't need to
		# stuff everything to be printed into a single string
		for arg in args:
			print(arg, end='')
		print("")
else:   
	vprint = lambda *a: None      # do-nothing function
#and make sure the regular model functions have this too
functions_gen_cascade_model.define_vprint(verbose)	

#ensure data directory for this subreddit exists - for saving posts, cascades, params, etc
file_utils.verify_dir("reddit_data/%s" % subreddit)

#if using socsim data, special load process (no time-defined sets)
if socsim_data:
	socsim_data_functions_gen_cascade_model.define_vprint(verbose)		#define vprint for that module as well
	#load all the training and testing data for this domain
	train_posts, train_cascades, train_params, train_fit_fail_list, test_posts, test_cascades, test_params, test_fit_fail_list = socsim_data_functions_gen_cascade_model.load_data(subreddit)

#otherwise, standard data load (use month-year and lengths to define testing set)
else:
	#no training data, since just using observed comments for this cascade

	vprint("\nLoading processed testing data")
	#load pre-processed posts and their reconstructed cascades for testing period (no params here!)
	test_posts, test_cascades = functions_gen_cascade_model.load_processed_posts(subreddit, testing_start_month, testing_start_year, testing_num, load_cascades=True)
Example #11
    #so any overlap is redundant and can be thrown away
    #(yes, there are neater ways to do this, but I don't care!)
    for item in subreddit_dict.keys():
        if len(subreddit_dict[item]) > 1:
            #crypto and cyber drown out cve, so remove it
            if ("crypto" in subreddit_dict[item] or "cyber"
                    in subreddit_dict[item]) and "cve" in subreddit_dict[item]:
                subreddit_dict[item].remove("cve")
        subreddit_dict[item] = subreddit_dict[item][0]

    #save as pickle for later
    print("Saving subreddit->domain mapping to", subreddits_filepath)
    file_utils.save_pickle(subreddit_dict, subreddits_filepath)

#verify directories for output files
file_utils.verify_dir("model_files/params")
file_utils.verify_dir("model_files/posts")
file_utils.verify_dir("model_files/graphs")
file_utils.verify_dir("model_files/users")

#loop all subreddits
for subreddit, domain in subreddit_dict.items():

    if subreddit != subreddit_filter:
        continue
    '''
	if domain != "crypto":
		continue
	'''

    print("\nProcessing", subreddit, "in", domain, "domain")