def influence(la): split_timestamp = int(time.mktime(datetime.datetime.strptime("2013/01/01", "%Y/%m/%d").timetuple())) time_diff = 86400 influence_tuples = la.estimate_influencer_effect(1, split_timestamp, time_diff, control_divider=0.1, selection_method="random", klim=5) mean_fr_influence = utils.mean_sd([v[5] for v in influence_tuples]) mean_nonfr_influence = utils.mean_sd([v[6] for v in influence_tuples]) print mean_fr_influence, mean_nonfr_influence
def locality(la): items_cov_list, items_popularity, cov_ratio_list, degree_distr = la.compare_items_edge_coverage(1, minimum_interactions=1) print utils.mean_sd(items_cov_list) print utils.mean_sd(items_popularity) plotter.plotHist(sorted([val for val in cov_ratio_list]), "Ratio of Common Likes with friends to total popularity", "Frequency (Number of items)", logyscale=True, bins=20) #####plotter.plotHist(sorted([val for val in cov_ratio_list]), "Ratio of Edge coverage to total popularity", "Frequency", logyscale=True) #plotter.plotHist(sorted(items_popularity), "Item", "total popularity") plotter.plotCumulativePopularity(items_popularity, labelx="Item percentile", labely="Cum. percent of number of likes") f_out = open('plots/data/influenced_loves_ratio.tsv', 'w') for i in range(len(items_cov_list)): f_out.write(str(items_cov_list[i])+' '+str(items_popularity[i])+' '+str(cov_ratio_list[i])+'\n') f_out.close()
def influence(la):
    """Print mean/SD of friend vs. non-friend influence for interaction type 1.

    Duplicate (auto-formatted) of the influence() defined earlier in this
    file.  la: analyzer exposing estimate_influencer_effect() -- presumably
    a LocalityAnalyzer; TODO confirm.
    """
    # Unix timestamp of the 2013/01/01 split date.
    split_timestamp = int(
        time.mktime(
            datetime.datetime.strptime("2013/01/01", "%Y/%m/%d").timetuple()))
    # One day, in seconds.
    time_diff = 86400
    influence_tuples = la.estimate_influencer_effect(1,
                                                     split_timestamp,
                                                     time_diff,
                                                     control_divider=0.1,
                                                     selection_method="random",
                                                     klim=5)
    # Fields 5/6 hold friend / non-friend influence values (same indexing
    # convention as test_influence in this file).
    mean_fr_influence = utils.mean_sd([v[5] for v in influence_tuples])
    mean_nonfr_influence = utils.mean_sd([v[6] for v in influence_tuples])
    print mean_fr_influence, mean_nonfr_influence
def locality(la):
    """Print edge-coverage stats, plot distributions, and dump a per-item TSV.

    Duplicate (auto-formatted) of the locality() defined earlier in this
    file.  Writes "<coverage> <popularity> <ratio>" lines to
    plots/data/influenced_loves_ratio.tsv.
    """
    # degree_distr is returned but never used below.
    items_cov_list, items_popularity, cov_ratio_list, degree_distr = la.compare_items_edge_coverage(
        1, minimum_interactions=1)
    print utils.mean_sd(items_cov_list)
    print utils.mean_sd(items_popularity)
    plotter.plotHist(sorted([val for val in cov_ratio_list]),
                     "Ratio of Common Likes with friends to total popularity",
                     "Frequency (Number of items)",
                     logyscale=True,
                     bins=20)
    #####plotter.plotHist(sorted([val for val in cov_ratio_list]), "Ratio of Edge coverage to total popularity", "Frequency", logyscale=True)
    #plotter.plotHist(sorted(items_popularity), "Item", "total popularity")
    plotter.plotCumulativePopularity(items_popularity,
                                     labelx="Item percentile",
                                     labely="Cum. percent of number of likes")
    # NOTE(review): file handle is not closed if a write raises; consider a
    # with-statement.
    f_out = open('plots/data/influenced_loves_ratio.tsv', 'w')
    for i in range(len(items_cov_list)):
        f_out.write(
            str(items_cov_list[i]) + ' ' + str(items_popularity[i]) + ' ' +
            str(cov_ratio_list[i]) + '\n')
    f_out.close()
def test_influence(la, interact_type, time_diff, time_scale, split_timestamp,
                   control_divider,
                   min_interactions_beforeaftersplit_per_user, max_tries,
                   max_node_computes, num_threads, max_interact_ratio_error,
                   max_sim_ratio_error, min_friends_match_ratio, klim,
                   nonfr_match, method, allow_duplicates):
    """Run the parallel influence/susceptibility estimate and summarize it.

    Delegates to la.estimate_influencer_effect_parallel(...) and prints
    mean/SD summaries of the per-node result tuples.

    Each result tuple v is indexed as: v[0] test-set size, v[3] friend
    similarity, v[4] non-friend similarity, v[5] friend influence,
    v[6] non-friend influence (inferred from the uses below -- confirm
    against the analyzer's implementation).

    Returns (node_test_set_sizes, fr_sim_vals, nonfr_sim_vals,
             fr_inf_vals, nonfr_inf_vals).
    """
    #time_diff = 90000 #86400
    influence_tuples = la.estimate_influencer_effect_parallel(
        interact_type, split_timestamp, time_diff, time_scale,
        control_divider=control_divider,
        min_interactions_beforeaftersplit_per_user=min_interactions_beforeaftersplit_per_user,
        selection_method="random", klim=klim,
        max_tries=max_tries, max_node_computes=max_node_computes,
        num_threads=num_threads,
        max_interact_ratio_error=max_interact_ratio_error,
        max_sim_ratio_error=max_sim_ratio_error,
        min_friends_match_ratio=min_friends_match_ratio,
        nonfr_match = nonfr_match,
        method=method, allow_duplicates=allow_duplicates)
    node_test_set_sizes = [v[0] for v in influence_tuples]
    fr_inf_vals = [v[5] for v in influence_tuples]
    nonfr_inf_vals = [v[6] for v in influence_tuples]
    diff_inf_vals = [v[5]-v[6] for v in influence_tuples]
    # Considering only positive values (negative differences clipped to 0).
    diff_inf_vals_positiveonly = [v[5]-v[6] if v[5]-v[6] > 0 else 0
                                  for v in influence_tuples]
    mean_fr_influence = utils.mean_sd(fr_inf_vals)
    mean_nonfr_influence = utils.mean_sd(nonfr_inf_vals)
    mean_diff_influence = utils.mean_sd(diff_inf_vals)
    mean_diff_influence_positiveonly = utils.mean_sd(diff_inf_vals_positiveonly)
    fr_sim_vals = [v[3] for v in influence_tuples]
    nonfr_sim_vals = [v[4] for v in influence_tuples]
    diff_sim_vals = [v[3]-v[4] for v in influence_tuples]
    mean_fr_sim = utils.mean_sd(fr_sim_vals)
    mean_nonfr_sim = utils.mean_sd(nonfr_sim_vals)
    mean_diff_sim = utils.mean_sd(diff_sim_vals)
    print "\nTest Results"
    print "Mean FrSim={0}, Mean NonFrSim={1}, MeanDiff={2}".format(mean_fr_sim, mean_nonfr_sim, mean_diff_sim)
    print "MeanFrInf={0}, Mean NonFrInf={1}".format(mean_fr_influence, mean_nonfr_influence)
    print "MeanDiff={0}, MeanDiffPositiveOnly={1}".format(mean_diff_influence, mean_diff_influence_positiveonly)
    print len(fr_inf_vals), len(nonfr_inf_vals)
    return node_test_set_sizes, fr_sim_vals, nonfr_sim_vals, fr_inf_vals, nonfr_inf_vals
def show_basic_stats(self): num_interactdata_users = sum([1 for _ in self.netdata.get_nodes_iterable(should_have_interactions=True)]) print "Number of Users with interaction data", num_interactdata_users num_frienddata_users = sum([1 for _ in self.netdata.get_nodes_iterable(should_have_friends=True)]) print "Number of Users with friendship data", num_frienddata_users num_overall_users = self.netdata.get_total_num_nodes() print "Number of overall users", num_overall_users fr_arr = [v.get_num_friends() for v in self.netdata.get_nodes_iterable(should_have_friends=True)] print "Mean, SD of number of friends per user", mean_sd(fr_arr) print "Number of users with zero friends", sum([1 for v in fr_arr if v == 0]) num_items_all = self.netdata.get_total_num_items() print "Total number of items", num_items_all print "Types of interactions with items", self.netdata.interaction_types items_by_interaction = [] for interact_type in self.netdata.interaction_types: items_by_interaction.append(set()) for v in self.netdata.get_nodes_iterable(should_have_interactions=True): for interact_type in self.netdata.interaction_types: items_by_interaction[interact_type] |= set(v.get_items_interacted_with(interact_type)) for i in range(len(items_by_interaction)): print( "--Total number of items with interaction %s: %d" %(i, len(items_by_interaction[i])) ) sum_each_interaction_dict = self.get_sum_interactions_by_type() for interact_type, total_interacts in sum_each_interaction_dict.iteritems(): #print(interact_type) print( "--Total, Mean, of %d-type interactions per user = (%d, %f" %(interact_type, total_interacts, total_interacts/float(num_interactdata_users))) print( "--Total, Mean, of %d-type interactions per item = (%d, %f)" %(interact_type, total_interacts, total_interacts/float(num_items_all)) ) for interact_type in self.netdata.interaction_types: user_interacts = [ v.get_num_interactions(interact_type) for v in self.netdata.get_nodes_iterable(should_have_interactions=True)] print("--Min, 
Max %d-type interacts per user= %d, %d" %(interact_type, min(user_interacts), max(user_interacts))) print "Min., Max. timestamp of interactions", self.get_min_max_interact_times() return
def run_computation(data, computation_cmd, outf, interact_type,
                    create_fake_prefs, allow_duplicates, split_date_str,
                    dataset_domain, dataset_path,
                    min_interacts_beforeaftersplit_per_user,
                    max_interact_ratio_error, max_sim_ratio_error,
                    min_friends_match_ratio, traindata_fraction, M):
    """Dispatch one named analysis/experiment over the loaded dataset.

    computation_cmd selects the branch ("basic_stats", "random_similarity",
    "knn_similarity", "knn_recommender", "circle_coverage",
    "items_edge_coverage", "network_draw", "network_item_adopt",
    "node_details", "store_dataset", "compare_interact_types",
    "influence_test", "suscept_test", "gen_adopt_data",
    "compute_split_date"); None falls through to basic stats.  Results are
    printed and/or written to outf or ad-hoc output files.

    NOTE(review): relies on module-level names not visible in this chunk
    (cutoff_rating, outf_path, plotter, utils, fake_data, compute,
    gen_adopt, compare_sims, ttest_rel, strftime, pprint, ...).
    """
    net_analyzer = BasicNetworkAnalyzer(data)
    interaction_types = data.interact_types_dict  # name -> type index
    filename_prefix = computation_cmd if computation_cmd is not None else ""
    if computation_cmd=="basic_stats" or computation_cmd is None:
        net_analyzer.show_basic_stats()
        ## use below if you want to write a new dataset (e.g. after filtering)
        data.store_ego_dataset("/home/amit/datasets/social_activity_data/lastfm_filtered_listen/", write_maps=False)
        #data.compute_allpairs_sim(interact_type, data_type=ord("a"))
    elif computation_cmd=="random_similarity":
        # Compare friend-circle similarity against randomly sampled users.
        for type_name, type_index in interaction_types.iteritems():
            circlesims, globalsims = net_analyzer.compare_circle_global_similarity(type_index, num_random_trials=5, cutoff_rating=cutoff_rating)
            #plotter.plotLinesYY(circlesims, globalsims, "Friends", "Global")
            outf.write("User_id\tcircle_sim\tnonfriend_sim\n")
            outf.write(type_name + '\n')
            for ind in range(len(circlesims)):
                outf.write("%s\t%f\t%f\n" %(circlesims[ind][0], circlesims[ind][1], globalsims[ind][1]))
            print "\n", type_name, ":"
            print "Circle Average", sum([v2 for v1,v2 in circlesims])/float(len(circlesims))
            print "Global Average", sum([v2 for v1,v2 in globalsims])/float(len(globalsims))
    elif computation_cmd=="knn_similarity":
        # Compute K-nearest similarity (friends' kNN vs global kNN).
        KLIMITS = [10]
        outf.write("User_id\tk\tcircle_sim\tnonfriend_sim\n")
        for type_name, type_index in interaction_types.iteritems():
            for curr_lim in KLIMITS:
                plot_circle, plot_external = net_analyzer.compare_circle_global_knnsimilarity(type_index, klim=curr_lim, cutoff_rating=cutoff_rating)
                compare_sims(plot_circle, plot_external)
                outf.write(type_name+'\n')
                for ind in range(len(plot_circle)):
                    outf.write("%s\t%d\t%f\t%f\n" %(plot_circle[ind][0], curr_lim, plot_circle[ind][1], plot_external[ind][1]))
                #plotter.plotLinesYY(plot_circle, plot_external, "Friends", "Global")
                print type_name, "K", curr_lim
                print "Circle Average", utils.mean_sd([v2 for v1,v2 in plot_circle]), len(plot_circle)
                print "Global Average", utils.mean_sd([v2 for v1,v2 in plot_external]), len(plot_external)
    elif computation_cmd=="knn_recommender":
        # Compute K-nearest recommender NDCG (friends vs non-friends).
        KLIMITS = [10]
        rec_analyzer = RecommenderAnalyzer(data, max_recs_shown=10, traintest_split=0.7, cutoff_rating=cutoff_rating)
        outf.write("User_id\tk\trun_index\tcircle_ndcg\tnonfriend_ndcg\n")
        for type_name, type_index in interaction_types.iteritems():
            for curr_lim in KLIMITS:
                local_avg=[]
                global_avg=[]
                Ntotal = 10
                for i in range(Ntotal): # randomize because of training-test split.
                    plot_circle, plot_external = rec_analyzer.compare_knearest_recommenders(type_index, klim=curr_lim, num_processes=2)
                    compare_sims(plot_circle, plot_external)
                    outf.write(type_name + "\n")
                    for ind in range(len(plot_circle)):
                        outf.write("%s\t%d\t%d\t%f\t%f\n" %(plot_circle[ind][0], curr_lim, i, plot_circle[ind][1], plot_external[ind][1]))
                    print "\n", type_name, "K", curr_lim
                    #print plot_circle, plot_external
                    curr_avg_local = utils.mean_sd([v2 for v1,v2 in plot_circle])
                    curr_avg_global = utils.mean_sd([v2 for v1,v2 in plot_external])
                    print "Circle Average", curr_avg_local
                    print "Global Average", curr_avg_global
                    local_avg.append(curr_avg_local[0])
                    global_avg.append(curr_avg_global[0])
                    #plotLinesYY(plot_circle, plot_external, "Friends", "Global")
                print "Local", sum(local_avg)/float(Ntotal)
                print "Global", sum(global_avg)/float(Ntotal)
    elif computation_cmd == "circle_coverage":
        # Fraction of a user's items covered by friends, bucketed by
        # friend-count range.
        lim_friends = [(5,10), (10,20), (20,50), (50,100)]
        for fr_limit in lim_friends:
            locality_analyzer = LocalityAnalyzer(data)
            coverage_list = locality_analyzer.compare_circle_item_coverages(0, fr_limit[0], fr_limit[1])
            plotter.plotLineY(sorted(coverage_list), "User", "Fraction of Items Covered with %d-%d friends" % (fr_limit[0], fr_limit[1]))
            print utils.mean_sd(coverage_list)
    elif computation_cmd == "items_edge_coverage":
        locality_analyzer = LocalityAnalyzer(data)
        # NOTE(review): unpacks 3 values here, but locality() earlier in
        # this file unpacks 4 from the same method -- confirm which
        # signature is current.
        items_cov_list, items_popularity, cov_ratio_list = locality_analyzer.compare_items_edge_coverage(1, minimum_interactions=1)
        print utils.mean_sd(items_cov_list)
        print utils.mean_sd(items_popularity)
        #plotter.plotHist(sorted([val for val in cov_ratio_list if val<=1]), "Ratio of Edge coverage to total popularity", "Frequency", logyscale=True)
        #####plotter.plotHist(sorted([val for val in cov_ratio_list]), "Ratio of Edge coverage to total popularity", "Frequency", logyscale=True)
        #plotter.plotHist(sorted(items_popularity), "Item", "total popularity")
        plotter.plotCumulativePopularity(items_popularity, labelx="Item percentile", labely="Cum. percent of number of likes")
    elif computation_cmd == "network_draw":
        net_visualizor = NetworkVisualizor(data)
        net_visualizor.draw_network()
    elif computation_cmd == "network_item_adopt":
        # Plot adoption of one hard-coded item id.
        net_visualizor = NetworkVisualizor(data)
        pprint(net_visualizor.plot_item_adoption(1669118))
    elif computation_cmd == "node_details":
        # 'user_ids' file: one id per line, with a "User_id" header row.
        for node_id in open('user_ids'):
            if node_id.strip('\n') != "User_id":
                net_analyzer.get_node_details(int(node_id.strip('\n')))
    elif computation_cmd=="store_dataset":
        # Export interactions, item popularity, and friendships as TSVs.
        user_interacts = net_analyzer.get_user_interacts(1, cutoff_rating)
        f = open(outf_path+ 'user_interacts_'+dataset_domain+'.tsv', 'w')
        f.write("user_id\titem_id\ttimestamp\n")
        for user_id, item_id, timestamp in user_interacts:
            f.write("%s\t%s\t%s\n" %(user_id, item_id, timestamp))
        f.close()
        item_pop = net_analyzer.get_items_popularity(1, cutoff_rating)
        f = open(outf_path+'items_'+dataset_domain+'.tsv','w')
        f.write("item_id\tpopularity\n")
        for item_id, pop in item_pop.iteritems():
            f.write("%s\t%s\n" %(item_id, pop))
        f.close()
        user_friends = net_analyzer.get_user_friends()
        # NOTE(review): unlike the two files above, this one is written to
        # the current directory, not outf_path -- confirm intentional.
        f = open('user_friends_'+dataset_domain+'.tsv','w')
        f.write("user_id\tfriend_id\n")
        for user_id, friend_id in user_friends:
            f.write("%s\t%s\n" %(user_id, friend_id))
        f.close()
        print "Successfully stored tsv dataset"
    elif computation_cmd=="compare_interact_types":
        # Pairwise scatter of per-user interaction counts across types;
        # assumes at least three interaction types exist.
        num_interacts_dict = net_analyzer.compare_interaction_types()
        interact_types = num_interacts_dict.keys()
        plotter.plotLinesYY(num_interacts_dict[interact_types[0]], num_interacts_dict[interact_types[1]], interact_types[0], interact_types[1], display=True, logyscale=True)
        plotter.plotLinesYY(num_interacts_dict[interact_types[1]], num_interacts_dict[interact_types[2]], interact_types[1], interact_types[2], display=True, logyscale=True)
        plotter.plotLinesYY(num_interacts_dict[interact_types[0]], num_interacts_dict[interact_types[2]], interact_types[0], interact_types[2], display=True, logyscale=True)
    elif computation_cmd=="influence_test":
        # ta = TemporalAnalyzer(data)
        #interact_type = data.interact_types_dict["listen"
        # time_scale can be 'w':wallclock_time or 'o':ordinal_time
        split_date_str = "2008/01/01"  # overrides the parameter for this branch
        t_window = -1
        t_scale = ord('w')
        max_tries_val = 10000
        max_node_computes_val = 100
        max_interact_ratio_error = 0.1
        klim_val=5
        split_timestamp = int(time.mktime(datetime.datetime.strptime(split_date_str, "%Y/%m/%d").timetuple()))
        # Create training/test sets that will be used by fake generation.
        data.create_training_test_bytime(interact_type, split_timestamp)
        if create_fake_prefs is not None:
            print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1)
            fake_data.generate_fake_preferences(data,interact_type, split_timestamp, min_interactions_beforeaftersplit_per_user=min_interacts_beforeaftersplit_per_user, time_window=t_window, time_scale=t_scale, method=create_fake_prefs)
            #fake_data.generate_random_preferences(data, interact_type, split_timestamp)
            print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1)
        # Need to generate again because fake data changes test data.
        data.create_training_test_bytime(interact_type, split_timestamp)
        la = LocalityAnalyzer(data)
        # NOTE(review): passes num_processes=4 and omits several keyword
        # args (num_threads, nonfr_match, allow_duplicates, ...) that the
        # test_influence defined in this file requires -- confirm
        # compute.test_influence has a different signature.
        inf_tuple = compute.test_influence(la, interact_type=interact_type,
                time_diff=t_window, time_scale=ord('w'), split_timestamp=split_timestamp,
                #time_diff=100000, split_date_str="1970/06/23",
                control_divider=0.01,
                min_interactions_beforeaftersplit_per_user = min_interacts_beforeaftersplit_per_user,
                max_tries = max_tries_val,
                max_node_computes=max_node_computes_val, num_processes=4,
                max_interact_ratio_error=max_interact_ratio_error,
                klim=klim_val, method="influence")
        print "t-test results", ttest_rel(inf_tuple[2], inf_tuple[3])
        num_vals = len(inf_tuple[0])
        f = open("influence_test", "w")
        for i in range(num_vals):
            f.write("%f\t%f\t%f\t%f\n" % (inf_tuple[0][i], inf_tuple[1][i], inf_tuple[2][i], inf_tuple[3][i]))
        f.close()
    elif computation_cmd=="suscept_test":
        use_artists = "songs" if "songs" in dataset_path else "artists"
        interact_type_str = "listen" if interact_type==0 else "love"
        #M = [50]#,20]#,30,40,50]
        t_scale = ord('o') # ordinal scale, this is the default used in paper.
        NUM_NODES_TO_COMPUTE = 4000000 # maximum number nodes to compute?
        num_threads=4 # the number of threads to spawn
        max_tries_val = None#30000 # should we stop after max_tries?
        max_node_computes_val = NUM_NODES_TO_COMPUTE/num_threads # number of nodes to compute at each node
        #max_interact_ratio_error =0.2 # these are errors (defaults are 0.1,0.1)
        #max_sim_ratio_error = 0.2
        #min_friends_match_ratio = 0.5 # important to be 1 for simulation--because e.g. in influence, we use a person's all friends to compute his next like
        klim_val = None # not used for influence test
        nonfr_match = "random" #random, serial, kbest. Default is random.
        num_loop = 1 # number of times we calculate this. For averaging results over multiple runs.
        f = open("suscept_test_results/"+dataset_domain + dataset_path.split("/")[-2] + interact_type_str+ strftime("%Y-%m-%d_%H:%M:%S")+'.dat', 'w')
        # Header lines record the full experimental configuration.
        f.write("# use_artists=%r\tallow_duplicates=%r\tmax_node_computes_val=%d\tcreate_fake_prefs=%r\tnum_loop=%d\n" % (use_artists, allow_duplicates, max_node_computes_val, create_fake_prefs, num_loop))
        f.write("# split_train_test_date=%s\ttime_scale=%d\tmin_interactions_beforeaftersplit_per_user=%d\tnum_threads=%d\n" % (split_date_str, t_scale, min_interacts_beforeaftersplit_per_user, num_threads))
        f.write("# max_interact_ratio_error=%f\tmax_sim_ratio_error=%f\tmin_friends_match_ratio=%f\n" %(max_interact_ratio_error, max_sim_ratio_error, min_friends_match_ratio))
        # M is a list of time-window sizes; num_loop repeats each for averaging.
        for t_window in M:
            for h in range(num_loop):
                f.write("\n\n################### ALERTINFO: STARTING ITERATION %d with M=%d\n" %(h, t_window))
                if split_date_str=="test":
                    split_timestamp = 2000
                else:
                    split_timestamp = int(time.mktime(datetime.datetime.strptime(split_date_str, "%Y/%m/%d").timetuple()))
                #split_timestamp=25000000
                if create_fake_prefs is not None:
                    data.create_training_test_bytime(interact_type, split_timestamp)
                    #print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1)
                    fake_data.generate_fake_preferences(data,interact_type, split_timestamp, min_interactions_beforeaftersplit_per_user = min_interacts_beforeaftersplit_per_user, time_window=t_window, time_scale=t_scale, method=create_fake_prefs)
                    #print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1)
                # Need to generate again because fake data changes test data.
                data.create_training_test_bytime(interact_type, split_timestamp, min_interactions_beforeaftersplit_per_user=min_interacts_beforeaftersplit_per_user)
                la = LocalityAnalyzer(data)
                inf_tuple = compute.test_influence(la, interact_type=interact_type,
                        time_diff=t_window, time_scale=t_scale, split_timestamp=split_timestamp,
                        #time_diff=100000, split_date_str="1970/06/23",
                        control_divider=0.01, # not used anymore
                        min_interactions_beforeaftersplit_per_user = min_interacts_beforeaftersplit_per_user,
                        max_tries = max_tries_val,
                        max_node_computes=max_node_computes_val, num_threads=num_threads,
                        max_interact_ratio_error = max_interact_ratio_error,
                        max_sim_ratio_error = max_sim_ratio_error,
                        min_friends_match_ratio=min_friends_match_ratio,
                        klim = klim_val, nonfr_match=nonfr_match,
                        method="suscept", allow_duplicates=allow_duplicates)
                print "t-test results", ttest_rel(inf_tuple[2], inf_tuple[3])
                num_vals = len(inf_tuple[0])
                f.write("TestSetSize\tFrSimilarity\tNonFrSimilarity\tFrOverlap\tNonFrOverlap\tRandom_run_no\tM\n")
                for i in range(num_vals):
                    f.write("%d\t%f\t%f\t%f\t%f\t%d\t%d\n" % (inf_tuple[0][i], inf_tuple[1][i], inf_tuple[2][i], inf_tuple[3][i], inf_tuple[4][i], h, t_window))
        f.close()
    elif computation_cmd=="gen_adopt_data":
        t_window = 100
        t_scale = ord('o')
        if split_date_str=="test":
            split_timestamp = 2000
        else:
            split_timestamp = int(time.mktime(datetime.datetime.strptime(split_date_str, "%Y/%m/%d").timetuple()))
        if create_fake_prefs is not None:
            data.create_training_test_bytime(interact_type, split_timestamp)
            #print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1)
            fake_data.generate_fake_preferences(data,interact_type, split_timestamp, min_interactions_beforeaftersplit_per_user = min_interacts_beforeaftersplit_per_user, time_window=t_window, time_scale=t_scale, method=create_fake_prefs)
        data.create_training_test_bytime(interact_type, split_timestamp)
        gen_adopt.generate_adoption_data(data, interact_type, split_timestamp, min_interactions_beforeaftersplit_per_user=min_interacts_beforeaftersplit_per_user, time_window=t_window, time_scale=t_scale)
    elif computation_cmd=="compute_split_date":
        # ret_timestamp is in days, hence the *86400 when converting back.
        ret_timestamp = compute.compute_cutoff_date(data, interact_type, traindata_fraction)
        print ret_timestamp
        print datetime.datetime.fromtimestamp(ret_timestamp*86400).strftime("%Y-%m-%d")

# NOTE(review): the triple-quoted string opened below contains a dead,
# auto-formatted duplicate of run_computation; it is inert text.
"""
def run_computation(data, computation_cmd, outf, interact_type, create_fake_prefs, allow_duplicates, split_date_str, dataset_domain, dataset_path, min_interacts_beforeaftersplit_per_user, max_interact_ratio_error, max_sim_ratio_error, min_friends_match_ratio, traindata_fraction, M): net_analyzer = BasicNetworkAnalyzer(data) interaction_types = data.interact_types_dict filename_prefix = computation_cmd if computation_cmd is not None else "" if computation_cmd == "basic_stats" or computation_cmd is None: net_analyzer.show_basic_stats() ## use below if you want to write a new dataset (e.g. after filtering) data.store_ego_dataset( "/home/amit/datasets/social_activity_data/lastfm_filtered_listen/", write_maps=False) #data.compute_allpairs_sim(interact_type, data_type=ord("a")) elif computation_cmd == "random_similarity": for type_name, type_index in interaction_types.iteritems(): circlesims, globalsims = net_analyzer.compare_circle_global_similarity( type_index, num_random_trials=5, cutoff_rating=cutoff_rating) #plotter.plotLinesYY(circlesims, globalsims, "Friends", "Global") outf.write("User_id\tcircle_sim\tnonfriend_sim\n") outf.write(type_name + '\n') for ind in range(len(circlesims)): outf.write("%s\t%f\t%f\n" % (circlesims[ind][0], circlesims[ind][1], globalsims[ind][1])) print "\n", type_name, ":" print "Circle Average", sum([v2 for v1, v2 in circlesims]) / float( len(circlesims)) print "Global Average", sum([v2 for v1, v2 in globalsims]) / float( len(globalsims)) elif computation_cmd == "knn_similarity": #Compute K-nearest similarity KLIMITS = [10] outf.write("User_id\tk\tcircle_sim\tnonfriend_sim\n") for type_name, type_index in interaction_types.iteritems(): for curr_lim in KLIMITS: plot_circle, plot_external = net_analyzer.compare_circle_global_knnsimilarity( type_index, klim=curr_lim, cutoff_rating=cutoff_rating) compare_sims(plot_circle, plot_external) outf.write(type_name + '\n') for ind in range(len(plot_circle)): outf.write("%s\t%d\t%f\t%f\n" % 
(plot_circle[ind][0], curr_lim, plot_circle[ind][1], plot_external[ind][1])) #plotter.plotLinesYY(plot_circle, plot_external, "Friends", "Global") print type_name, "K", curr_lim print "Circle Average", utils.mean_sd( [v2 for v1, v2 in plot_circle]), len(plot_circle) print "Global Average", utils.mean_sd( [v2 for v1, v2 in plot_external]), len(plot_external) elif computation_cmd == "knn_recommender": #Compute K-nearest recommender KLIMITS = [10] rec_analyzer = RecommenderAnalyzer(data, max_recs_shown=10, traintest_split=0.7, cutoff_rating=cutoff_rating) outf.write("User_id\tk\trun_index\tcircle_ndcg\tnonfriend_ndcg\n") for type_name, type_index in interaction_types.iteritems(): for curr_lim in KLIMITS: local_avg = [] global_avg = [] Ntotal = 10 for i in range( Ntotal): # randomize because of training-test split. plot_circle, plot_external = rec_analyzer.compare_knearest_recommenders( type_index, klim=curr_lim, num_processes=2) compare_sims(plot_circle, plot_external) outf.write(type_name + "\n") for ind in range(len(plot_circle)): outf.write( "%s\t%d\t%d\t%f\t%f\n" % (plot_circle[ind][0], curr_lim, i, plot_circle[ind][1], plot_external[ind][1])) print "\n", type_name, "K", curr_lim #print plot_circle, plot_external curr_avg_local = utils.mean_sd( [v2 for v1, v2 in plot_circle]) curr_avg_global = utils.mean_sd( [v2 for v1, v2 in plot_external]) print "Circle Average", curr_avg_local print "Global Average", curr_avg_global local_avg.append(curr_avg_local[0]) global_avg.append(curr_avg_global[0]) #plotLinesYY(plot_circle, plot_external, "Friends", "Global") print "Local", sum(local_avg) / float(Ntotal) print "Global", sum(global_avg) / float(Ntotal) elif computation_cmd == "circle_coverage": lim_friends = [(5, 10), (10, 20), (20, 50), (50, 100)] for fr_limit in lim_friends: locality_analyzer = LocalityAnalyzer(data) coverage_list = locality_analyzer.compare_circle_item_coverages( 0, fr_limit[0], fr_limit[1]) plotter.plotLineY( sorted(coverage_list), "User", "Fraction 
of Items Covered with %d-%d friends" % (fr_limit[0], fr_limit[1])) print utils.mean_sd(coverage_list) elif computation_cmd == "items_edge_coverage": locality_analyzer = LocalityAnalyzer(data) items_cov_list, items_popularity, cov_ratio_list = locality_analyzer.compare_items_edge_coverage( 1, minimum_interactions=1) print utils.mean_sd(items_cov_list) print utils.mean_sd(items_popularity) #plotter.plotHist(sorted([val for val in cov_ratio_list if val<=1]), "Ratio of Edge coverage to total popularity", "Frequency", logyscale=True) #####plotter.plotHist(sorted([val for val in cov_ratio_list]), "Ratio of Edge coverage to total popularity", "Frequency", logyscale=True) #plotter.plotHist(sorted(items_popularity), "Item", "total popularity") plotter.plotCumulativePopularity( items_popularity, labelx="Item percentile", labely="Cum. percent of number of likes") elif computation_cmd == "network_draw": net_visualizor = NetworkVisualizor(data) net_visualizor.draw_network() elif computation_cmd == "network_item_adopt": net_visualizor = NetworkVisualizor(data) pprint(net_visualizor.plot_item_adoption(1669118)) elif computation_cmd == "node_details": for node_id in open('user_ids'): if node_id.strip('\n') != "User_id": net_analyzer.get_node_details(int(node_id.strip('\n'))) elif computation_cmd == "store_dataset": user_interacts = net_analyzer.get_user_interacts(1, cutoff_rating) f = open(outf_path + 'user_interacts_' + dataset_domain + '.tsv', 'w') f.write("user_id\titem_id\ttimestamp\n") for user_id, item_id, timestamp in user_interacts: f.write("%s\t%s\t%s\n" % (user_id, item_id, timestamp)) f.close() item_pop = net_analyzer.get_items_popularity(1, cutoff_rating) f = open(outf_path + 'items_' + dataset_domain + '.tsv', 'w') f.write("item_id\tpopularity\n") for item_id, pop in item_pop.iteritems(): f.write("%s\t%s\n" % (item_id, pop)) f.close() user_friends = net_analyzer.get_user_friends() f = open('user_friends_' + dataset_domain + '.tsv', 'w') f.write("user_id\tfriend_id\n") 
for user_id, friend_id in user_friends: f.write("%s\t%s\n" % (user_id, friend_id)) f.close() print "Successfully stored tsv dataset" elif computation_cmd == "compare_interact_types": num_interacts_dict = net_analyzer.compare_interaction_types() interact_types = num_interacts_dict.keys() plotter.plotLinesYY(num_interacts_dict[interact_types[0]], num_interacts_dict[interact_types[1]], interact_types[0], interact_types[1], display=True, logyscale=True) plotter.plotLinesYY(num_interacts_dict[interact_types[1]], num_interacts_dict[interact_types[2]], interact_types[1], interact_types[2], display=True, logyscale=True) plotter.plotLinesYY(num_interacts_dict[interact_types[0]], num_interacts_dict[interact_types[2]], interact_types[0], interact_types[2], display=True, logyscale=True) elif computation_cmd == "influence_test": # ta = TemporalAnalyzer(data) #interact_type = data.interact_types_dict["listen" # time_scale can be 'w':wallclock_time or 'o':ordinal_time split_date_str = "2008/01/01" t_window = -1 t_scale = ord('w') max_tries_val = 10000 max_node_computes_val = 100 max_interact_ratio_error = 0.1 klim_val = 5 split_timestamp = int( time.mktime( datetime.datetime.strptime(split_date_str, "%Y/%m/%d").timetuple())) # crate trainig test sets that will be used by fake geernation data.create_training_test_bytime(interact_type, split_timestamp) if create_fake_prefs is not None: print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1) fake_data.generate_fake_preferences( data, interact_type, split_timestamp, min_interactions_beforeaftersplit_per_user= min_interacts_beforeaftersplit_per_user, time_window=t_window, time_scale=t_scale, method=create_fake_prefs) #fake_data.generate_random_preferences(data, interact_type, split_timestamp) print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1) # Need to generate again because fake data changes test data data.create_training_test_bytime(interact_type, split_timestamp) la = 
LocalityAnalyzer(data) inf_tuple = compute.test_influence( la, interact_type=interact_type, time_diff=t_window, time_scale=ord('w'), split_timestamp=split_timestamp, #time_diff=100000, split_date_str="1970/06/23", control_divider=0.01, min_interactions_beforeaftersplit_per_user= min_interacts_beforeaftersplit_per_user, max_tries=max_tries_val, max_node_computes=max_node_computes_val, num_processes=4, max_interact_ratio_error=max_interact_ratio_error, klim=klim_val, method="influence") print "t-test results", ttest_rel(inf_tuple[2], inf_tuple[3]) num_vals = len(inf_tuple[0]) f = open("influence_test", "w") for i in range(num_vals): f.write("%f\t%f\t%f\t%f\n" % (inf_tuple[0][i], inf_tuple[1][i], inf_tuple[2][i], inf_tuple[3][i])) f.close() elif computation_cmd == "suscept_test": use_artists = "songs" if "songs" in dataset_path else "artists" interact_type_str = "listen" if interact_type == 0 else "love" #M = [50]#,20]#,30,40,50] t_scale = ord('o') # ordinal scale, this is the default used in paper. NUM_NODES_TO_COMPUTE = 4000000 # maximum number nodes to compute? num_threads = 4 # the number of threads to spawn max_tries_val = None #30000 # should we stop after max_tries? max_node_computes_val = NUM_NODES_TO_COMPUTE / num_threads # number of nodes to compute at each node #max_interact_ratio_error =0.2 # these are errors (defaults are 0.1,0.1) #max_sim_ratio_error = 0.2 #min_friends_match_ratio = 0.5 # important to be 1 for simulation--because e.g. in influence, we use a person's all friends to compute his next like klim_val = None # not used for influence test nonfr_match = "random" #random, serial, kbest. Default is random. num_loop = 1 # number of times we calculate this. For averaging results over multiple runs. 
f = open( "suscept_test_results/" + dataset_domain + dataset_path.split("/")[-2] + interact_type_str + strftime("%Y-%m-%d_%H:%M:%S") + '.dat', 'w') f.write( "# use_artists=%r\tallow_duplicates=%r\tmax_node_computes_val=%d\tcreate_fake_prefs=%r\tnum_loop=%d\n" % (use_artists, allow_duplicates, max_node_computes_val, create_fake_prefs, num_loop)) f.write( "# split_train_test_date=%s\ttime_scale=%d\tmin_interactions_beforeaftersplit_per_user=%d\tnum_threads=%d\n" % (split_date_str, t_scale, min_interacts_beforeaftersplit_per_user, num_threads)) f.write( "# max_interact_ratio_error=%f\tmax_sim_ratio_error=%f\tmin_friends_match_ratio=%f\n" % (max_interact_ratio_error, max_sim_ratio_error, min_friends_match_ratio)) for t_window in M: for h in range(num_loop): f.write( "\n\n################### ALERTINFO: STARTING ITERATION %d with M=%d\n" % (h, t_window)) if split_date_str == "test": split_timestamp = 2000 else: split_timestamp = int( time.mktime( datetime.datetime.strptime( split_date_str, "%Y/%m/%d").timetuple())) #split_timestamp=25000000 if create_fake_prefs is not None: data.create_training_test_bytime(interact_type, split_timestamp) #print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1) fake_data.generate_fake_preferences( data, interact_type, split_timestamp, min_interactions_beforeaftersplit_per_user= min_interacts_beforeaftersplit_per_user, time_window=t_window, time_scale=t_scale, method=create_fake_prefs) #print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1) # Need to generate again because fake data changes test data data.create_training_test_bytime( interact_type, split_timestamp, min_interactions_beforeaftersplit_per_user= min_interacts_beforeaftersplit_per_user) la = LocalityAnalyzer(data) inf_tuple = compute.test_influence( la, interact_type=interact_type, time_diff=t_window, time_scale=t_scale, split_timestamp=split_timestamp, #time_diff=100000, split_date_str="1970/06/23", control_divider=0.01, # not 
used anymore min_interactions_beforeaftersplit_per_user= min_interacts_beforeaftersplit_per_user, max_tries=max_tries_val, max_node_computes=max_node_computes_val, num_threads=num_threads, max_interact_ratio_error=max_interact_ratio_error, max_sim_ratio_error=max_sim_ratio_error, min_friends_match_ratio=min_friends_match_ratio, klim=klim_val, nonfr_match=nonfr_match, method="suscept", allow_duplicates=allow_duplicates) print "t-test results", ttest_rel(inf_tuple[2], inf_tuple[3]) num_vals = len(inf_tuple[0]) f.write( "TestSetSize\tFrSimilarity\tNonFrSimilarity\tFrOverlap\tNonFrOverlap\tRandom_run_no\tM\n" ) for i in range(num_vals): f.write("%d\t%f\t%f\t%f\t%f\t%d\t%d\n" % (inf_tuple[0][i], inf_tuple[1][i], inf_tuple[2][i], inf_tuple[3][i], inf_tuple[4][i], h, t_window)) f.close() elif computation_cmd == "gen_adopt_data": t_window = 100 t_scale = ord('o') if split_date_str == "test": split_timestamp = 2000 else: split_timestamp = int( time.mktime( datetime.datetime.strptime(split_date_str, "%Y/%m/%d").timetuple())) if create_fake_prefs is not None: data.create_training_test_bytime(interact_type, split_timestamp) #print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1) fake_data.generate_fake_preferences( data, interact_type, split_timestamp, min_interactions_beforeaftersplit_per_user= min_interacts_beforeaftersplit_per_user, time_window=t_window, time_scale=t_scale, method=create_fake_prefs) data.create_training_test_bytime(interact_type, split_timestamp) gen_adopt.generate_adoption_data( data, interact_type, split_timestamp, min_interactions_beforeaftersplit_per_user= min_interacts_beforeaftersplit_per_user, time_window=t_window, time_scale=t_scale) elif computation_cmd == "compute_split_date": ret_timestamp = compute.compute_cutoff_date(data, interact_type, traindata_fraction) print ret_timestamp print datetime.datetime.fromtimestamp(ret_timestamp * 86400).strftime("%Y-%m-%d") """