def compare_meanified_array(items_pop1, items_pop2):
    """Plot binned popularity of items under two interaction types.

    Items in items_pop1 (item -> popularity count; plotted as "listen")
    are sorted by ascending count and filtered to counts >= 10.  The
    matching counts from items_pop2 (plotted as "love"; 0 when an item
    is absent there) are collected in parallel.  Both series are binned
    with get_meanified_array (bin width 100) and plotted twice via
    plotter.plotLinesYY: once with the median aggregate, once with the
    mean aggregate.

    Returns None; the output is the two displayed plots.
    """
    arr1 = []
    arr2 = []
    # Ascending popularity order so both curves are sorted by itype1 count.
    for k, v in sorted(items_pop1.iteritems(), key=operator.itemgetter(1)):
        if v >= 10:  # drop rare items (fewer than 10 interactions)
            arr1.append(v)
            # dict.get replaces the old "if k in ... else 0" branch (same behavior).
            arr2.append(items_pop2.get(k, 0))
    # One loop instead of two copy-pasted plotLinesYY calls; median first,
    # then mean, exactly as before.
    for method in ("median", "mean"):
        plotter.plotLinesYY(get_meanified_array(arr1, 100, method=method),
                            get_meanified_array(arr2, 100, method=method),
                            "listen", "love",
                            labelx="Items",
                            labely="Number of people who interact with this item",
                            display=True,
                            logyscale=True)
def compare_meanified_array(items_pop1, items_pop2):
    """Compare binned item popularity across two interaction maps.

    Builds two parallel series: counts >= 10 from items_pop1 in ascending
    order, and the corresponding counts from items_pop2 (zero when an item
    never appears there).  Draws a median-binned and then a mean-binned
    listen-vs-love line plot (bin width 100, log y-scale).
    """
    primary_counts = []
    secondary_counts = []
    ranked_items = sorted(items_pop1.iteritems(), key=operator.itemgetter(1))
    for item_id, count in ranked_items:
        if count < 10:
            continue  # skip items with fewer than 10 interactions
        primary_counts.append(count)
        secondary_counts.append(items_pop2[item_id] if item_id in items_pop2 else 0)
    # Median-aggregated bins.
    plotter.plotLinesYY(get_meanified_array(primary_counts, 100, method="median"),
                        get_meanified_array(secondary_counts, 100, method="median"),
                        "listen", "love",
                        labelx="Items",
                        labely="Number of people who interact with this item",
                        display=True,
                        logyscale=True)
    # Mean-aggregated bins.
    plotter.plotLinesYY(get_meanified_array(primary_counts, 100, method="mean"),
                        get_meanified_array(secondary_counts, 100, method="mean"),
                        "listen", "love",
                        labelx="Items",
                        labely="Number of people who interact with this item",
                        display=True,
                        logyscale=True)
    return
def compare_interact_types_byuser(na, itype1, itype2, binwidth, duplicates=True, min_exposure=1, plot_type="xy", logyscale=False, logxscale=False): y = na.compare_interaction_types2(min_exposure=min_exposure, return_duplicates=duplicates) tuple_arr = [(y[itype1][i], y[itype2][i]) for i in range(len(y[itype1])) if y[itype1][i] > 0] itype1_arr = [val[0] for val in tuple_arr] itype2_arr = [val[1] for val in tuple_arr] print max(itype1_arr), max(itype2_arr) itype1_arr2 = get_meanified_array(itype1_arr, binwidth) itype2_arr2 = get_meanified_array(itype2_arr, binwidth) labely_part = "Interactions" if duplicates else "Artists" if plot_type == "yy": plotter.plotLinesYY(itype1_arr2, itype2_arr2, itype1, itype2, labelx="Users (sorted by listen)", labely="Number of " + labely_part, display=True, logyscale=False) elif plot_type == "xy": """ plotter.plotLinesXY(itype1_arr, itype2_arr, labelx=itype1, labely=itype2, title_str="Number of "+ labely_part, display=True, logyscale=logyscale, logxscale=logxscale, ylim_val=[0, 100000]) """ import matplotlib.pyplot as plt plt.loglog(itype1_arr, itype2_arr, 'o') plt.show() return itype1_arr, itype2_arr
def compare_interact_types_byuser( na, itype1, itype2, binwidth, duplicates=True, min_exposure=1, plot_type="xy", logyscale=False, logxscale=False ): y = na.compare_interaction_types2(min_exposure=min_exposure, return_duplicates=duplicates) tuple_arr = [(y[itype1][i], y[itype2][i]) for i in range(len(y[itype1])) if y[itype1][i] > 0] itype1_arr = [val[0] for val in tuple_arr] itype2_arr = [val[1] for val in tuple_arr] print max(itype1_arr), max(itype2_arr) itype1_arr2 = get_meanified_array(itype1_arr, binwidth) itype2_arr2 = get_meanified_array(itype2_arr, binwidth) labely_part = "Interactions" if duplicates else "Artists" if plot_type == "yy": plotter.plotLinesYY( itype1_arr2, itype2_arr2, itype1, itype2, labelx="Users (sorted by listen)", labely="Number of " + labely_part, display=True, logyscale=False, ) elif plot_type == "xy": """ plotter.plotLinesXY(itype1_arr, itype2_arr, labelx=itype1, labely=itype2, title_str="Number of "+ labely_part, display=True, logyscale=logyscale, logxscale=logxscale, ylim_val=[0, 100000]) """ import matplotlib.pyplot as plt plt.loglog(itype1_arr, itype2_arr, "o") plt.show() return itype1_arr, itype2_arr
def run_computation(data, computation_cmd, outf, interact_type, create_fake_prefs, allow_duplicates, split_date_str, dataset_domain, dataset_path, min_interacts_beforeaftersplit_per_user, max_interact_ratio_error, max_sim_ratio_error, min_friends_match_ratio, traindata_fraction, M):
    """Dispatch one named analysis/experiment over a social-activity dataset.

    computation_cmd selects the branch: basic_stats (default when None),
    random_similarity, knn_similarity, knn_recommender, circle_coverage,
    items_edge_coverage, network_draw, network_item_adopt, node_details,
    store_dataset, compare_interact_types, influence_test, suscept_test,
    gen_adopt_data, compute_split_date.  Results go to plots, stdout,
    outf, or files opened here, depending on the branch.

    NOTE(review): cutoff_rating and outf_path are used below but are not
    parameters or locals — presumably module-level globals; verify.
    NOTE(review): indentation reconstructed from a whitespace-collapsed
    source; nesting of the post-fake-data create_training_test_bytime
    calls mirrors the formatted copy of this function — confirm.
    """
    net_analyzer = BasicNetworkAnalyzer(data)
    interaction_types = data.interact_types_dict
    # NOTE(review): filename_prefix is computed but never used below.
    filename_prefix = computation_cmd if computation_cmd is not None else ""
    if computation_cmd=="basic_stats" or computation_cmd is None:
        net_analyzer.show_basic_stats()
        ## use below if you want to write a new dataset (e.g. after filtering)
        data.store_ego_dataset("/home/amit/datasets/social_activity_data/lastfm_filtered_listen/", write_maps=False)
        #data.compute_allpairs_sim(interact_type, data_type=ord("a"))
    elif computation_cmd=="random_similarity":
        # Friend-circle vs. global (non-friend) similarity, per interaction type.
        for type_name, type_index in interaction_types.iteritems():
            circlesims, globalsims = net_analyzer.compare_circle_global_similarity(type_index, num_random_trials=5, cutoff_rating=cutoff_rating)
            #plotter.plotLinesYY(circlesims, globalsims, "Friends", "Global")
            outf.write("User_id\tcircle_sim\tnonfriend_sim\n")
            outf.write(type_name + '\n')
            for ind in range(len(circlesims)):
                # circlesims/globalsims entries look like (user_id, similarity).
                outf.write("%s\t%f\t%f\n" %(circlesims[ind][0], circlesims[ind][1], globalsims[ind][1]))
            print "\n", type_name, ":"
            print "Circle Average", sum([v2 for v1,v2 in circlesims])/float(len(circlesims))
            print "Global Average", sum([v2 for v1,v2 in globalsims])/float(len(globalsims))
    elif computation_cmd=="knn_similarity":
        #Compute K-nearest similarity
        KLIMITS = [10]
        outf.write("User_id\tk\tcircle_sim\tnonfriend_sim\n")
        for type_name, type_index in interaction_types.iteritems():
            for curr_lim in KLIMITS:
                plot_circle, plot_external = net_analyzer.compare_circle_global_knnsimilarity(type_index, klim=curr_lim, cutoff_rating=cutoff_rating)
                compare_sims(plot_circle, plot_external)
                outf.write(type_name+'\n')
                for ind in range(len(plot_circle)):
                    outf.write("%s\t%d\t%f\t%f\n" %(plot_circle[ind][0], curr_lim, plot_circle[ind][1], plot_external[ind][1]))
                #plotter.plotLinesYY(plot_circle, plot_external, "Friends", "Global")
                print type_name, "K", curr_lim
                print "Circle Average", utils.mean_sd([v2 for v1,v2 in plot_circle]), len(plot_circle)
                print "Global Average", utils.mean_sd([v2 for v1,v2 in plot_external]), len(plot_external)
    elif computation_cmd=="knn_recommender":
        #Compute K-nearest recommender
        KLIMITS = [10]
        rec_analyzer = RecommenderAnalyzer(data, max_recs_shown=10, traintest_split=0.7, cutoff_rating=cutoff_rating)
        outf.write("User_id\tk\trun_index\tcircle_ndcg\tnonfriend_ndcg\n")
        for type_name, type_index in interaction_types.iteritems():
            for curr_lim in KLIMITS:
                local_avg=[]
                global_avg=[]
                Ntotal = 10
                for i in range(Ntotal): # randomize because of training-test split.
                    plot_circle, plot_external = rec_analyzer.compare_knearest_recommenders(type_index, klim=curr_lim, num_processes=2)
                    compare_sims(plot_circle, plot_external)
                    outf.write(type_name + "\n")
                    for ind in range(len(plot_circle)):
                        outf.write("%s\t%d\t%d\t%f\t%f\n" %(plot_circle[ind][0], curr_lim, i, plot_circle[ind][1], plot_external[ind][1]))
                    print "\n", type_name, "K", curr_lim
                    #print plot_circle, plot_external
                    # mean_sd presumably returns (mean, sd); index 0 is the mean.
                    curr_avg_local = utils.mean_sd([v2 for v1,v2 in plot_circle])
                    curr_avg_global = utils.mean_sd([v2 for v1,v2 in plot_external])
                    print "Circle Average", curr_avg_local
                    print "Global Average", curr_avg_global
                    local_avg.append(curr_avg_local[0])
                    global_avg.append(curr_avg_global[0])
                    #plotLinesYY(plot_circle, plot_external, "Friends", "Global")
                print "Local", sum(local_avg)/float(Ntotal)
                print "Global", sum(global_avg)/float(Ntotal)
    elif computation_cmd == "circle_coverage":
        # Item coverage of a user's friend circle, bucketed by circle size.
        lim_friends = [(5,10), (10,20), (20,50), (50,100)]
        for fr_limit in lim_friends:
            locality_analyzer = LocalityAnalyzer(data)
            coverage_list = locality_analyzer.compare_circle_item_coverages(0, fr_limit[0], fr_limit[1])
            plotter.plotLineY(sorted(coverage_list), "User", "Fraction of Items Covered with %d-%d friends" % (fr_limit[0], fr_limit[1]))
            print utils.mean_sd(coverage_list)
    elif computation_cmd == "items_edge_coverage":
        locality_analyzer = LocalityAnalyzer(data)
        items_cov_list, items_popularity, cov_ratio_list = locality_analyzer.compare_items_edge_coverage(1, minimum_interactions=1)
        print utils.mean_sd(items_cov_list)
        print utils.mean_sd(items_popularity)
        #plotter.plotHist(sorted([val for val in cov_ratio_list if val<=1]), "Ratio of Edge coverage to total popularity", "Frequency", logyscale=True)
        #####plotter.plotHist(sorted([val for val in cov_ratio_list]), "Ratio of Edge coverage to total popularity", "Frequency", logyscale=True)
        #plotter.plotHist(sorted(items_popularity), "Item", "total popularity")
        plotter.plotCumulativePopularity(items_popularity, labelx="Item percentile", labely="Cum. percent of number of likes")
    elif computation_cmd == "network_draw":
        net_visualizor = NetworkVisualizor(data)
        net_visualizor.draw_network()
    elif computation_cmd == "network_item_adopt":
        # 1669118 is a hard-coded example item id.
        net_visualizor = NetworkVisualizor(data)
        pprint(net_visualizor.plot_item_adoption(1669118))
    elif computation_cmd == "node_details":
        # Reads user ids (one per line, with a "User_id" header) from ./user_ids.
        for node_id in open('user_ids'):
            if node_id.strip('\n') != "User_id":
                net_analyzer.get_node_details(int(node_id.strip('\n')))
    elif computation_cmd=="store_dataset":
        # Dump interactions, item popularity, and the friendship graph as TSV.
        user_interacts = net_analyzer.get_user_interacts(1, cutoff_rating)
        f = open(outf_path+ 'user_interacts_'+dataset_domain+'.tsv', 'w')
        f.write("user_id\titem_id\ttimestamp\n")
        for user_id, item_id, timestamp in user_interacts:
            f.write("%s\t%s\t%s\n" %(user_id, item_id, timestamp))
        f.close()
        item_pop = net_analyzer.get_items_popularity(1, cutoff_rating)
        f = open(outf_path+'items_'+dataset_domain+'.tsv','w')
        f.write("item_id\tpopularity\n")
        for item_id, pop in item_pop.iteritems():
            f.write("%s\t%s\n" %(item_id, pop))
        f.close()
        user_friends = net_analyzer.get_user_friends()
        # NOTE(review): unlike the two files above, this one is written to the
        # current directory, not outf_path — confirm intentional.
        f = open('user_friends_'+dataset_domain+'.tsv','w')
        f.write("user_id\tfriend_id\n")
        for user_id, friend_id in user_friends:
            f.write("%s\t%s\n" %(user_id, friend_id))
        f.close()
        print "Successfully stored tsv dataset"
    elif computation_cmd=="compare_interact_types":
        # Pairwise plots of per-user counts across the first three interaction
        # types (assumes at least three types; dict key order is arbitrary).
        num_interacts_dict = net_analyzer.compare_interaction_types()
        interact_types = num_interacts_dict.keys()
        plotter.plotLinesYY(num_interacts_dict[interact_types[0]], num_interacts_dict[interact_types[1]], interact_types[0], interact_types[1], display=True, logyscale=True)
        plotter.plotLinesYY(num_interacts_dict[interact_types[1]], num_interacts_dict[interact_types[2]], interact_types[1], interact_types[2], display=True, logyscale=True)
        plotter.plotLinesYY(num_interacts_dict[interact_types[0]], num_interacts_dict[interact_types[2]], interact_types[0], interact_types[2], display=True, logyscale=True)
    elif computation_cmd=="influence_test":
        # ta = TemporalAnalyzer(data)
        #interact_type = data.interact_types_dict["listen"
        # time_scale can be 'w':wallclock_time or 'o':ordinal_time
        # NOTE(review): split_date_str parameter is overridden here.
        split_date_str = "2008/01/01"
        t_window = -1
        t_scale = ord('w')
        max_tries_val = 10000
        max_node_computes_val = 100
        max_interact_ratio_error = 0.1
        klim_val=5
        split_timestamp = int(time.mktime(datetime.datetime.strptime(split_date_str, "%Y/%m/%d").timetuple()))
        # create training/test sets that will be used by fake-preference generation
        data.create_training_test_bytime(interact_type, split_timestamp)
        if create_fake_prefs is not None:
            print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1)
            fake_data.generate_fake_preferences(data,interact_type, split_timestamp, min_interactions_beforeaftersplit_per_user=min_interacts_beforeaftersplit_per_user, time_window=t_window, time_scale=t_scale, method=create_fake_prefs)
            #fake_data.generate_random_preferences(data, interact_type, split_timestamp)
            print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1)
            # Need to generate again because fake data changes test data
            data.create_training_test_bytime(interact_type, split_timestamp)
        la = LocalityAnalyzer(data)
        inf_tuple = compute.test_influence(la, interact_type=interact_type, time_diff=t_window, time_scale=ord('w'), split_timestamp=split_timestamp,
                #time_diff=100000, split_date_str="1970/06/23",
                control_divider=0.01, min_interactions_beforeaftersplit_per_user = min_interacts_beforeaftersplit_per_user, max_tries = max_tries_val, max_node_computes=max_node_computes_val, num_processes=4, max_interact_ratio_error=max_interact_ratio_error, klim=klim_val, method="influence")
        # Paired t-test: friend vs. non-friend columns of the result tuple.
        print "t-test results", ttest_rel(inf_tuple[2], inf_tuple[3])
        num_vals = len(inf_tuple[0])
        f = open("influence_test", "w")
        for i in range(num_vals):
            f.write("%f\t%f\t%f\t%f\n" % (inf_tuple[0][i], inf_tuple[1][i], inf_tuple[2][i], inf_tuple[3][i]))
        f.close()
    elif computation_cmd=="suscept_test":
        # Susceptibility experiment; loops over window sizes M and num_loop runs.
        use_artists = "songs" if "songs" in dataset_path else "artists"
        interact_type_str = "listen" if interact_type==0 else "love"
        #M = [50]#,20]#,30,40,50]
        t_scale = ord('o') # ordinal scale, this is the default used in paper.
        NUM_NODES_TO_COMPUTE = 4000000 # maximum number nodes to compute?
        num_threads=4 # the number of threads to spawn
        max_tries_val = None#30000 # should we stop after max_tries?
        max_node_computes_val = NUM_NODES_TO_COMPUTE/num_threads # number of nodes to compute at each node
        #max_interact_ratio_error =0.2 # these are errors (defaults are 0.1,0.1)
        #max_sim_ratio_error = 0.2
        #min_friends_match_ratio = 0.5 # important to be 1 for simulation--because e.g. in influence, we use a person's all friends to compute his next like
        klim_val = None # not used for influence test
        nonfr_match = "random" #random, serial, kbest. Default is random.
        num_loop = 1 # number of times we calculate this. For averaging results over multiple runs.
        f = open("suscept_test_results/"+dataset_domain + dataset_path.split("/")[-2] + interact_type_str+ strftime("%Y-%m-%d_%H:%M:%S")+'.dat', 'w')
        # Header lines record the experiment configuration for reproducibility.
        f.write("# use_artists=%r\tallow_duplicates=%r\tmax_node_computes_val=%d\tcreate_fake_prefs=%r\tnum_loop=%d\n" % ( use_artists, allow_duplicates, max_node_computes_val, create_fake_prefs, num_loop))
        f.write("# split_train_test_date=%s\ttime_scale=%d\tmin_interactions_beforeaftersplit_per_user=%d\tnum_threads=%d\n" % ( split_date_str, t_scale, min_interacts_beforeaftersplit_per_user, num_threads))
        f.write("# max_interact_ratio_error=%f\tmax_sim_ratio_error=%f\tmin_friends_match_ratio=%f\n" %( max_interact_ratio_error, max_sim_ratio_error, min_friends_match_ratio ))
        for t_window in M:
            for h in range(num_loop):
                f.write("\n\n################### ALERTINFO: STARTING ITERATION %d with M=%d\n" %( h, t_window))
                if split_date_str=="test":
                    split_timestamp = 2000
                else:
                    split_timestamp = int(time.mktime(datetime.datetime.strptime(split_date_str, "%Y/%m/%d").timetuple()))
                #split_timestamp=25000000
                if create_fake_prefs is not None:
                    data.create_training_test_bytime(interact_type, split_timestamp)
                    #print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1)
                    fake_data.generate_fake_preferences(data,interact_type, split_timestamp, min_interactions_beforeaftersplit_per_user = min_interacts_beforeaftersplit_per_user, time_window=t_window, time_scale=t_scale, method=create_fake_prefs)
                    #print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1)
                # Need to generate again because fake data changes test data
                data.create_training_test_bytime(interact_type, split_timestamp, min_interactions_beforeaftersplit_per_user=min_interacts_beforeaftersplit_per_user)
                la = LocalityAnalyzer(data)
                inf_tuple = compute.test_influence(la, interact_type=interact_type, time_diff=t_window, time_scale=t_scale, split_timestamp=split_timestamp,
                        #time_diff=100000, split_date_str="1970/06/23",
                        control_divider=0.01, # not used anymore
                        min_interactions_beforeaftersplit_per_user = min_interacts_beforeaftersplit_per_user, max_tries = max_tries_val, max_node_computes=max_node_computes_val, num_threads=num_threads, max_interact_ratio_error = max_interact_ratio_error, max_sim_ratio_error = max_sim_ratio_error, min_friends_match_ratio=min_friends_match_ratio, klim = klim_val, nonfr_match=nonfr_match, method="suscept", allow_duplicates=allow_duplicates)
                print "t-test results", ttest_rel(inf_tuple[2], inf_tuple[3])
                num_vals = len(inf_tuple[0])
                f.write("TestSetSize\tFrSimilarity\tNonFrSimilarity\tFrOverlap\tNonFrOverlap\tRandom_run_no\tM\n")
                for i in range(num_vals):
                    f.write("%d\t%f\t%f\t%f\t%f\t%d\t%d\n" % (inf_tuple[0][i], inf_tuple[1][i], inf_tuple[2][i], inf_tuple[3][i], inf_tuple[4][i], h, t_window))
        f.close()
    elif computation_cmd=="gen_adopt_data":
        t_window = 100
        t_scale = ord('o')
        if split_date_str=="test":
            split_timestamp = 2000
        else:
            split_timestamp = int(time.mktime(datetime.datetime.strptime(split_date_str, "%Y/%m/%d").timetuple()))
        if create_fake_prefs is not None:
            data.create_training_test_bytime(interact_type, split_timestamp)
            #print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1)
            fake_data.generate_fake_preferences(data,interact_type, split_timestamp, min_interactions_beforeaftersplit_per_user = min_interacts_beforeaftersplit_per_user, time_window=t_window, time_scale=t_scale, method=create_fake_prefs)
        # Regenerate: fake data changes the test set.
        data.create_training_test_bytime(interact_type, split_timestamp)
        gen_adopt.generate_adoption_data(data, interact_type, split_timestamp, min_interactions_beforeaftersplit_per_user=min_interacts_beforeaftersplit_per_user, time_window=t_window, time_scale=t_scale)
    elif computation_cmd=="compute_split_date":
        # Find the timestamp that puts traindata_fraction of interactions
        # before the train/test split.
        ret_timestamp = compute.compute_cutoff_date(data, interact_type, traindata_fraction)
        print ret_timestamp
        # ret_timestamp is in days; *86400 converts to epoch seconds.
        print datetime.datetime.fromtimestamp(ret_timestamp*86400).strftime("%Y-%m-%d")
"""
def run_computation(data, computation_cmd, outf, interact_type, create_fake_prefs, allow_duplicates, split_date_str, dataset_domain, dataset_path, min_interacts_beforeaftersplit_per_user, max_interact_ratio_error, max_sim_ratio_error, min_friends_match_ratio, traindata_fraction, M): net_analyzer = BasicNetworkAnalyzer(data) interaction_types = data.interact_types_dict filename_prefix = computation_cmd if computation_cmd is not None else "" if computation_cmd == "basic_stats" or computation_cmd is None: net_analyzer.show_basic_stats() ## use below if you want to write a new dataset (e.g. after filtering) data.store_ego_dataset( "/home/amit/datasets/social_activity_data/lastfm_filtered_listen/", write_maps=False) #data.compute_allpairs_sim(interact_type, data_type=ord("a")) elif computation_cmd == "random_similarity": for type_name, type_index in interaction_types.iteritems(): circlesims, globalsims = net_analyzer.compare_circle_global_similarity( type_index, num_random_trials=5, cutoff_rating=cutoff_rating) #plotter.plotLinesYY(circlesims, globalsims, "Friends", "Global") outf.write("User_id\tcircle_sim\tnonfriend_sim\n") outf.write(type_name + '\n') for ind in range(len(circlesims)): outf.write("%s\t%f\t%f\n" % (circlesims[ind][0], circlesims[ind][1], globalsims[ind][1])) print "\n", type_name, ":" print "Circle Average", sum([v2 for v1, v2 in circlesims]) / float( len(circlesims)) print "Global Average", sum([v2 for v1, v2 in globalsims]) / float( len(globalsims)) elif computation_cmd == "knn_similarity": #Compute K-nearest similarity KLIMITS = [10] outf.write("User_id\tk\tcircle_sim\tnonfriend_sim\n") for type_name, type_index in interaction_types.iteritems(): for curr_lim in KLIMITS: plot_circle, plot_external = net_analyzer.compare_circle_global_knnsimilarity( type_index, klim=curr_lim, cutoff_rating=cutoff_rating) compare_sims(plot_circle, plot_external) outf.write(type_name + '\n') for ind in range(len(plot_circle)): outf.write("%s\t%d\t%f\t%f\n" % 
(plot_circle[ind][0], curr_lim, plot_circle[ind][1], plot_external[ind][1])) #plotter.plotLinesYY(plot_circle, plot_external, "Friends", "Global") print type_name, "K", curr_lim print "Circle Average", utils.mean_sd( [v2 for v1, v2 in plot_circle]), len(plot_circle) print "Global Average", utils.mean_sd( [v2 for v1, v2 in plot_external]), len(plot_external) elif computation_cmd == "knn_recommender": #Compute K-nearest recommender KLIMITS = [10] rec_analyzer = RecommenderAnalyzer(data, max_recs_shown=10, traintest_split=0.7, cutoff_rating=cutoff_rating) outf.write("User_id\tk\trun_index\tcircle_ndcg\tnonfriend_ndcg\n") for type_name, type_index in interaction_types.iteritems(): for curr_lim in KLIMITS: local_avg = [] global_avg = [] Ntotal = 10 for i in range( Ntotal): # randomize because of training-test split. plot_circle, plot_external = rec_analyzer.compare_knearest_recommenders( type_index, klim=curr_lim, num_processes=2) compare_sims(plot_circle, plot_external) outf.write(type_name + "\n") for ind in range(len(plot_circle)): outf.write( "%s\t%d\t%d\t%f\t%f\n" % (plot_circle[ind][0], curr_lim, i, plot_circle[ind][1], plot_external[ind][1])) print "\n", type_name, "K", curr_lim #print plot_circle, plot_external curr_avg_local = utils.mean_sd( [v2 for v1, v2 in plot_circle]) curr_avg_global = utils.mean_sd( [v2 for v1, v2 in plot_external]) print "Circle Average", curr_avg_local print "Global Average", curr_avg_global local_avg.append(curr_avg_local[0]) global_avg.append(curr_avg_global[0]) #plotLinesYY(plot_circle, plot_external, "Friends", "Global") print "Local", sum(local_avg) / float(Ntotal) print "Global", sum(global_avg) / float(Ntotal) elif computation_cmd == "circle_coverage": lim_friends = [(5, 10), (10, 20), (20, 50), (50, 100)] for fr_limit in lim_friends: locality_analyzer = LocalityAnalyzer(data) coverage_list = locality_analyzer.compare_circle_item_coverages( 0, fr_limit[0], fr_limit[1]) plotter.plotLineY( sorted(coverage_list), "User", "Fraction 
of Items Covered with %d-%d friends" % (fr_limit[0], fr_limit[1])) print utils.mean_sd(coverage_list) elif computation_cmd == "items_edge_coverage": locality_analyzer = LocalityAnalyzer(data) items_cov_list, items_popularity, cov_ratio_list = locality_analyzer.compare_items_edge_coverage( 1, minimum_interactions=1) print utils.mean_sd(items_cov_list) print utils.mean_sd(items_popularity) #plotter.plotHist(sorted([val for val in cov_ratio_list if val<=1]), "Ratio of Edge coverage to total popularity", "Frequency", logyscale=True) #####plotter.plotHist(sorted([val for val in cov_ratio_list]), "Ratio of Edge coverage to total popularity", "Frequency", logyscale=True) #plotter.plotHist(sorted(items_popularity), "Item", "total popularity") plotter.plotCumulativePopularity( items_popularity, labelx="Item percentile", labely="Cum. percent of number of likes") elif computation_cmd == "network_draw": net_visualizor = NetworkVisualizor(data) net_visualizor.draw_network() elif computation_cmd == "network_item_adopt": net_visualizor = NetworkVisualizor(data) pprint(net_visualizor.plot_item_adoption(1669118)) elif computation_cmd == "node_details": for node_id in open('user_ids'): if node_id.strip('\n') != "User_id": net_analyzer.get_node_details(int(node_id.strip('\n'))) elif computation_cmd == "store_dataset": user_interacts = net_analyzer.get_user_interacts(1, cutoff_rating) f = open(outf_path + 'user_interacts_' + dataset_domain + '.tsv', 'w') f.write("user_id\titem_id\ttimestamp\n") for user_id, item_id, timestamp in user_interacts: f.write("%s\t%s\t%s\n" % (user_id, item_id, timestamp)) f.close() item_pop = net_analyzer.get_items_popularity(1, cutoff_rating) f = open(outf_path + 'items_' + dataset_domain + '.tsv', 'w') f.write("item_id\tpopularity\n") for item_id, pop in item_pop.iteritems(): f.write("%s\t%s\n" % (item_id, pop)) f.close() user_friends = net_analyzer.get_user_friends() f = open('user_friends_' + dataset_domain + '.tsv', 'w') f.write("user_id\tfriend_id\n") 
for user_id, friend_id in user_friends: f.write("%s\t%s\n" % (user_id, friend_id)) f.close() print "Successfully stored tsv dataset" elif computation_cmd == "compare_interact_types": num_interacts_dict = net_analyzer.compare_interaction_types() interact_types = num_interacts_dict.keys() plotter.plotLinesYY(num_interacts_dict[interact_types[0]], num_interacts_dict[interact_types[1]], interact_types[0], interact_types[1], display=True, logyscale=True) plotter.plotLinesYY(num_interacts_dict[interact_types[1]], num_interacts_dict[interact_types[2]], interact_types[1], interact_types[2], display=True, logyscale=True) plotter.plotLinesYY(num_interacts_dict[interact_types[0]], num_interacts_dict[interact_types[2]], interact_types[0], interact_types[2], display=True, logyscale=True) elif computation_cmd == "influence_test": # ta = TemporalAnalyzer(data) #interact_type = data.interact_types_dict["listen" # time_scale can be 'w':wallclock_time or 'o':ordinal_time split_date_str = "2008/01/01" t_window = -1 t_scale = ord('w') max_tries_val = 10000 max_node_computes_val = 100 max_interact_ratio_error = 0.1 klim_val = 5 split_timestamp = int( time.mktime( datetime.datetime.strptime(split_date_str, "%Y/%m/%d").timetuple())) # crate trainig test sets that will be used by fake geernation data.create_training_test_bytime(interact_type, split_timestamp) if create_fake_prefs is not None: print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1) fake_data.generate_fake_preferences( data, interact_type, split_timestamp, min_interactions_beforeaftersplit_per_user= min_interacts_beforeaftersplit_per_user, time_window=t_window, time_scale=t_scale, method=create_fake_prefs) #fake_data.generate_random_preferences(data, interact_type, split_timestamp) print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1) # Need to generate again because fake data changes test data data.create_training_test_bytime(interact_type, split_timestamp) la = 
LocalityAnalyzer(data) inf_tuple = compute.test_influence( la, interact_type=interact_type, time_diff=t_window, time_scale=ord('w'), split_timestamp=split_timestamp, #time_diff=100000, split_date_str="1970/06/23", control_divider=0.01, min_interactions_beforeaftersplit_per_user= min_interacts_beforeaftersplit_per_user, max_tries=max_tries_val, max_node_computes=max_node_computes_val, num_processes=4, max_interact_ratio_error=max_interact_ratio_error, klim=klim_val, method="influence") print "t-test results", ttest_rel(inf_tuple[2], inf_tuple[3]) num_vals = len(inf_tuple[0]) f = open("influence_test", "w") for i in range(num_vals): f.write("%f\t%f\t%f\t%f\n" % (inf_tuple[0][i], inf_tuple[1][i], inf_tuple[2][i], inf_tuple[3][i])) f.close() elif computation_cmd == "suscept_test": use_artists = "songs" if "songs" in dataset_path else "artists" interact_type_str = "listen" if interact_type == 0 else "love" #M = [50]#,20]#,30,40,50] t_scale = ord('o') # ordinal scale, this is the default used in paper. NUM_NODES_TO_COMPUTE = 4000000 # maximum number nodes to compute? num_threads = 4 # the number of threads to spawn max_tries_val = None #30000 # should we stop after max_tries? max_node_computes_val = NUM_NODES_TO_COMPUTE / num_threads # number of nodes to compute at each node #max_interact_ratio_error =0.2 # these are errors (defaults are 0.1,0.1) #max_sim_ratio_error = 0.2 #min_friends_match_ratio = 0.5 # important to be 1 for simulation--because e.g. in influence, we use a person's all friends to compute his next like klim_val = None # not used for influence test nonfr_match = "random" #random, serial, kbest. Default is random. num_loop = 1 # number of times we calculate this. For averaging results over multiple runs. 
f = open( "suscept_test_results/" + dataset_domain + dataset_path.split("/")[-2] + interact_type_str + strftime("%Y-%m-%d_%H:%M:%S") + '.dat', 'w') f.write( "# use_artists=%r\tallow_duplicates=%r\tmax_node_computes_val=%d\tcreate_fake_prefs=%r\tnum_loop=%d\n" % (use_artists, allow_duplicates, max_node_computes_val, create_fake_prefs, num_loop)) f.write( "# split_train_test_date=%s\ttime_scale=%d\tmin_interactions_beforeaftersplit_per_user=%d\tnum_threads=%d\n" % (split_date_str, t_scale, min_interacts_beforeaftersplit_per_user, num_threads)) f.write( "# max_interact_ratio_error=%f\tmax_sim_ratio_error=%f\tmin_friends_match_ratio=%f\n" % (max_interact_ratio_error, max_sim_ratio_error, min_friends_match_ratio)) for t_window in M: for h in range(num_loop): f.write( "\n\n################### ALERTINFO: STARTING ITERATION %d with M=%d\n" % (h, t_window)) if split_date_str == "test": split_timestamp = 2000 else: split_timestamp = int( time.mktime( datetime.datetime.strptime( split_date_str, "%Y/%m/%d").timetuple())) #split_timestamp=25000000 if create_fake_prefs is not None: data.create_training_test_bytime(interact_type, split_timestamp) #print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1) fake_data.generate_fake_preferences( data, interact_type, split_timestamp, min_interactions_beforeaftersplit_per_user= min_interacts_beforeaftersplit_per_user, time_window=t_window, time_scale=t_scale, method=create_fake_prefs) #print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1) # Need to generate again because fake data changes test data data.create_training_test_bytime( interact_type, split_timestamp, min_interactions_beforeaftersplit_per_user= min_interacts_beforeaftersplit_per_user) la = LocalityAnalyzer(data) inf_tuple = compute.test_influence( la, interact_type=interact_type, time_diff=t_window, time_scale=t_scale, split_timestamp=split_timestamp, #time_diff=100000, split_date_str="1970/06/23", control_divider=0.01, # not 
used anymore min_interactions_beforeaftersplit_per_user= min_interacts_beforeaftersplit_per_user, max_tries=max_tries_val, max_node_computes=max_node_computes_val, num_threads=num_threads, max_interact_ratio_error=max_interact_ratio_error, max_sim_ratio_error=max_sim_ratio_error, min_friends_match_ratio=min_friends_match_ratio, klim=klim_val, nonfr_match=nonfr_match, method="suscept", allow_duplicates=allow_duplicates) print "t-test results", ttest_rel(inf_tuple[2], inf_tuple[3]) num_vals = len(inf_tuple[0]) f.write( "TestSetSize\tFrSimilarity\tNonFrSimilarity\tFrOverlap\tNonFrOverlap\tRandom_run_no\tM\n" ) for i in range(num_vals): f.write("%d\t%f\t%f\t%f\t%f\t%d\t%d\n" % (inf_tuple[0][i], inf_tuple[1][i], inf_tuple[2][i], inf_tuple[3][i], inf_tuple[4][i], h, t_window)) f.close() elif computation_cmd == "gen_adopt_data": t_window = 100 t_scale = ord('o') if split_date_str == "test": split_timestamp = 2000 else: split_timestamp = int( time.mktime( datetime.datetime.strptime(split_date_str, "%Y/%m/%d").timetuple())) if create_fake_prefs is not None: data.create_training_test_bytime(interact_type, split_timestamp) #print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1) fake_data.generate_fake_preferences( data, interact_type, split_timestamp, min_interactions_beforeaftersplit_per_user= min_interacts_beforeaftersplit_per_user, time_window=t_window, time_scale=t_scale, method=create_fake_prefs) data.create_training_test_bytime(interact_type, split_timestamp) gen_adopt.generate_adoption_data( data, interact_type, split_timestamp, min_interactions_beforeaftersplit_per_user= min_interacts_beforeaftersplit_per_user, time_window=t_window, time_scale=t_scale) elif computation_cmd == "compute_split_date": ret_timestamp = compute.compute_cutoff_date(data, interact_type, traindata_fraction) print ret_timestamp print datetime.datetime.fromtimestamp(ret_timestamp * 86400).strftime("%Y-%m-%d") """