# Standard-library and SciPy imports used across these examples; project modules
# (utils, plotter, compute, fake_data, gen_adopt, and the analyzer/visualizer
# classes) are assumed to come from the surrounding repository.
import datetime
import time
from pprint import pprint
from time import strftime

from scipy.stats import ttest_rel  # assumed source of ttest_rel

def influence(la):
    split_timestamp = int(time.mktime(datetime.datetime.strptime("2013/01/01", "%Y/%m/%d").timetuple()))
    time_diff = 86400
    influence_tuples = la.estimate_influencer_effect(1, split_timestamp, time_diff,
                                                     control_divider=0.1, 
                                                     selection_method="random",
                                                     klim=5)
    mean_fr_influence = utils.mean_sd([v[5] for v in influence_tuples]) 
    mean_nonfr_influence = utils.mean_sd([v[6] for v in influence_tuples])
    print mean_fr_influence, mean_nonfr_influence
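# utils.mean_sd is not shown in these examples; judging from how its result is
# later indexed (e.g. curr_avg_local[0] in run_computation), it appears to
# return a (mean, standard deviation) tuple. A minimal sketch of such a helper,
# offered only as an assumption about the project's utils module:
import math

def mean_sd(values):
    """Return (mean, population standard deviation) of a sequence of numbers."""
    n = len(values)
    if n == 0:
        return (0.0, 0.0)
    mean = sum(values) / float(n)
    variance = sum((x - mean) ** 2 for x in values) / float(n)
    return (mean, math.sqrt(variance))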
def locality(la):
    items_cov_list, items_popularity, cov_ratio_list, degree_distr = la.compare_items_edge_coverage(1, minimum_interactions=1)
    print utils.mean_sd(items_cov_list)
    print utils.mean_sd(items_popularity)
    plotter.plotHist(sorted([val for val in cov_ratio_list]), "Ratio of Common Likes with friends to total popularity", "Frequency (Number of items)", logyscale=True, bins=20)
    #####plotter.plotHist(sorted([val for val in cov_ratio_list]), "Ratio of Edge coverage to total popularity", "Frequency", logyscale=True)
    #plotter.plotHist(sorted(items_popularity), "Item", "total popularity")
    plotter.plotCumulativePopularity(items_popularity, labelx="Item percentile", labely="Cum. percent of number of likes")
    f_out = open('plots/data/influenced_loves_ratio.tsv', 'w')
    for i in range(len(items_cov_list)):
        f_out.write(str(items_cov_list[i])+' '+str(items_popularity[i])+' '+str(cov_ratio_list[i])+'\n')
    f_out.close()
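# The "tsv" written by locality() above is actually space-separated, with one
# row per item: edge coverage, total popularity, and their ratio. A small
# reader sketch (not part of the original code; the column names are
# descriptive labels, not part of the file itself):
def load_influenced_loves_ratio(path='plots/data/influenced_loves_ratio.tsv'):
    rows = []
    with open(path) as f:
        for line in f:
            coverage, popularity, ratio = line.split()
            rows.append((float(coverage), float(popularity), float(ratio)))
    return rows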
def test_influence(la, interact_type, time_diff, time_scale, split_timestamp, control_divider,
                   min_interactions_beforeaftersplit_per_user, max_tries, max_node_computes, num_threads,
                   max_interact_ratio_error, max_sim_ratio_error,
                   min_friends_match_ratio, klim, nonfr_match,
                   method, allow_duplicates):
    
    #time_diff = 90000 #86400
    influence_tuples = la.estimate_influencer_effect_parallel(interact_type, split_timestamp, 
                                                     time_diff, time_scale,  
                                                     control_divider=control_divider,
                                                     min_interactions_beforeaftersplit_per_user=min_interactions_beforeaftersplit_per_user,
                                                     selection_method="random",
                                                     klim=klim, max_tries=max_tries,
                                                     max_node_computes=max_node_computes,
                                                     num_threads=num_threads,
                                                     max_interact_ratio_error=max_interact_ratio_error,
                                                     max_sim_ratio_error=max_sim_ratio_error,
                                                     min_friends_match_ratio=min_friends_match_ratio,
                                                     nonfr_match = nonfr_match,
                                                     method=method, 
                                                     allow_duplicates=allow_duplicates)
    node_test_set_sizes = [v[0] for v in influence_tuples]
    fr_inf_vals = [v[5] for v in influence_tuples]
    nonfr_inf_vals = [v[6] for v in influence_tuples]
    diff_inf_vals = [v[5]-v[6] for v in influence_tuples]
    # Considering only positive values 
    diff_inf_vals_positiveonly = [v[5]-v[6] if v[5]-v[6] > 0 else 0 for v in influence_tuples]
    mean_fr_influence = utils.mean_sd(fr_inf_vals)
    mean_nonfr_influence = utils.mean_sd(nonfr_inf_vals)
    mean_diff_influence = utils.mean_sd(diff_inf_vals)
    mean_diff_influence_positiveonly = utils.mean_sd(diff_inf_vals_positiveonly)

    fr_sim_vals = [v[3] for v in influence_tuples]
    nonfr_sim_vals = [v[4] for v in influence_tuples]
    diff_sim_vals = [v[3]-v[4] for v in influence_tuples]
    mean_fr_sim = utils.mean_sd(fr_sim_vals)
    mean_nonfr_sim = utils.mean_sd(nonfr_sim_vals)
    mean_diff_sim = utils.mean_sd(diff_sim_vals)

    print "\nTest Results"
    print "Mean FrSim={0}, Mean NonFrSim={1}, MeanDiff={2}".format(mean_fr_sim, mean_nonfr_sim, mean_diff_sim)
    print "MeanFrInf={0}, Mean NonFrInf={1}".format(mean_fr_influence, mean_nonfr_influence)
    print "MeanDiff={0}, MeanDiffPositiveOnly={1}".format(mean_diff_influence, mean_diff_influence_positiveonly)
    print len(fr_inf_vals), len(nonfr_inf_vals)
    return node_test_set_sizes, fr_sim_vals, nonfr_sim_vals, fr_inf_vals, nonfr_inf_vals
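# The positional indexing of influence_tuples above (v[0], v[3]..v[6]) is easy
# to misread. A hedged sketch of a record type matching how the fields are used
# in test_influence; the field names are my own guesses, and fields 1 and 2 are
# simply not referenced in these examples:
import collections

InfluenceRecord = collections.namedtuple(
    'InfluenceRecord',
    ['test_set_size', 'field_1', 'field_2',
     'fr_sim', 'nonfr_sim', 'fr_influence', 'nonfr_influence'])

def as_records(influence_tuples):
    # Keeps only the first seven positions, which are the ones test_influence reads.
    return [InfluenceRecord(*v[:7]) for v in influence_tuples]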
    def show_basic_stats(self):
        num_interactdata_users = sum([1 for _ in self.netdata.get_nodes_iterable(should_have_interactions=True)])
        print "Number of Users with interaction data", num_interactdata_users
        num_frienddata_users = sum([1 for _ in self.netdata.get_nodes_iterable(should_have_friends=True)])
        print "Number of Users with friendship data", num_frienddata_users

        num_overall_users = self.netdata.get_total_num_nodes()
        print "Number of overall users", num_overall_users

        fr_arr = [v.get_num_friends() for v in self.netdata.get_nodes_iterable(should_have_friends=True)]
        print "Mean, SD of number of friends per user", mean_sd(fr_arr)
        print "Number of users with zero friends", sum([1 for v in fr_arr if v == 0])
        
        num_items_all = self.netdata.get_total_num_items()
        print "Total number of items", num_items_all
        print "Types of interactions with items", self.netdata.interaction_types
        
        items_by_interaction = []
        for interact_type in self.netdata.interaction_types:
            items_by_interaction.append(set())

        for v in self.netdata.get_nodes_iterable(should_have_interactions=True):
            for interact_type in self.netdata.interaction_types:
                items_by_interaction[interact_type] |= set(v.get_items_interacted_with(interact_type))

        for i in range(len(items_by_interaction)):
            print( "--Total number of items with interaction type %d: %d" %(i, len(items_by_interaction[i])) )
        
        sum_each_interaction_dict = self.get_sum_interactions_by_type()
        for interact_type, total_interacts in sum_each_interaction_dict.iteritems():
            #print(interact_type)
            print( "--Total, Mean of %d-type interactions per user = (%d, %f)"
                   %(interact_type, total_interacts, total_interacts/float(num_interactdata_users)))

            print( "--Total, Mean of %d-type interactions per item = (%d, %f)"
                   %(interact_type, total_interacts, total_interacts/float(num_items_all)) )
        
        for interact_type in self.netdata.interaction_types:
            user_interacts = [ v.get_num_interactions(interact_type) for v in self.netdata.get_nodes_iterable(should_have_interactions=True)]
            print("--Min, Max %d-type interacts per user= %d, %d" %(interact_type, min(user_interacts), max(user_interacts)))
        print "Min., Max. timestamp of interactions", self.get_min_max_interact_times()
        return
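# run_computation below repeatedly turns a split date such as "2008/01/01"
# into an epoch timestamp via time.mktime(strptime(...).timetuple()). A small
# helper capturing that idiom (this helper is not part of the original code;
# it keeps the same local-time semantics as the examples):
import datetime
import time

def date_str_to_split_timestamp(date_str, fmt="%Y/%m/%d"):
    return int(time.mktime(datetime.datetime.strptime(date_str, fmt).timetuple()))

# Example: date_str_to_split_timestamp("2013/01/01") reproduces the
# split_timestamp computed in influence() above.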
def run_computation(data, computation_cmd, outf, interact_type, create_fake_prefs,
        allow_duplicates, split_date_str, dataset_domain, dataset_path,
        min_interacts_beforeaftersplit_per_user,
        max_interact_ratio_error, max_sim_ratio_error, min_friends_match_ratio, 
        traindata_fraction, M):
    net_analyzer = BasicNetworkAnalyzer(data)
    interaction_types = data.interact_types_dict
    filename_prefix = computation_cmd if computation_cmd is not None else ""

    if computation_cmd=="basic_stats" or computation_cmd is None:
        net_analyzer.show_basic_stats()
        ## use below if you want to write a new dataset (e.g. after filtering)
        data.store_ego_dataset("/home/amit/datasets/social_activity_data/lastfm_filtered_listen/", write_maps=False)
        #data.compute_allpairs_sim(interact_type, data_type=ord("a"))

    elif computation_cmd=="random_similarity":
        for type_name, type_index in interaction_types.iteritems():
            circlesims, globalsims = net_analyzer.compare_circle_global_similarity(type_index, num_random_trials=5, cutoff_rating=cutoff_rating)
            #plotter.plotLinesYY(circlesims, globalsims, "Friends", "Global")
            outf.write("User_id\tcircle_sim\tnonfriend_sim\n")
            outf.write(type_name + '\n')
            for ind in range(len(circlesims)):
                outf.write("%s\t%f\t%f\n" %(circlesims[ind][0], circlesims[ind][1], globalsims[ind][1]))
            print "\n", type_name, ":" 
            print "Circle Average", sum([v2 for v1,v2 in circlesims])/float(len(circlesims))
            print "Global Average", sum([v2 for v1,v2 in globalsims])/float(len(globalsims))

    elif computation_cmd=="knn_similarity":
        #Compute K-nearest similarity
        KLIMITS = [10]
        outf.write("User_id\tk\tcircle_sim\tnonfriend_sim\n")
        
        for type_name, type_index in interaction_types.iteritems():
            for curr_lim in KLIMITS:
                plot_circle, plot_external = net_analyzer.compare_circle_global_knnsimilarity(type_index, klim=curr_lim, cutoff_rating=cutoff_rating)
                compare_sims(plot_circle, plot_external)
                outf.write(type_name+'\n')
                for ind in range(len(plot_circle)):
                    outf.write("%s\t%d\t%f\t%f\n" %(plot_circle[ind][0], curr_lim, plot_circle[ind][1], plot_external[ind][1]))
                #plotter.plotLinesYY(plot_circle, plot_external, "Friends", "Global")
                print type_name, "K", curr_lim
                print "Circle Average", utils.mean_sd([v2 for v1,v2 in plot_circle]), len(plot_circle)
                print "Global Average", utils.mean_sd([v2 for v1,v2 in plot_external]), len(plot_external)

    elif computation_cmd=="knn_recommender":
        #Compute K-nearest recommender
        KLIMITS = [10]
        rec_analyzer = RecommenderAnalyzer(data, max_recs_shown=10, traintest_split=0.7, cutoff_rating=cutoff_rating)
        outf.write("User_id\tk\trun_index\tcircle_ndcg\tnonfriend_ndcg\n")
        for type_name, type_index in interaction_types.iteritems():
            for curr_lim in KLIMITS:
                local_avg=[]
                global_avg=[]
                Ntotal = 10
                for i in range(Ntotal): # randomize because of training-test split.
                    plot_circle, plot_external = rec_analyzer.compare_knearest_recommenders(type_index, klim=curr_lim, num_processes=2)
                    compare_sims(plot_circle, plot_external)
                    outf.write(type_name + "\n")
                    for ind in range(len(plot_circle)):
                        outf.write("%s\t%d\t%d\t%f\t%f\n" %(plot_circle[ind][0], curr_lim, i, plot_circle[ind][1], plot_external[ind][1]))
                    print "\n", type_name, "K", curr_lim

                    #print plot_circle, plot_external
                    curr_avg_local = utils.mean_sd([v2 for v1,v2 in plot_circle])
                    curr_avg_global =  utils.mean_sd([v2 for v1,v2 in plot_external])
                    print "Circle Average", curr_avg_local
                    print "Global Average", curr_avg_global
                    local_avg.append(curr_avg_local[0])
                    global_avg.append(curr_avg_global[0])
                    #plotLinesYY(plot_circle, plot_external, "Friends", "Global")
                print "Local", sum(local_avg)/float(Ntotal)
                print "Global", sum(global_avg)/float(Ntotal)
    elif computation_cmd == "circle_coverage":
        lim_friends = [(5,10), (10,20), (20,50), (50,100)]
        for fr_limit in lim_friends:
            locality_analyzer = LocalityAnalyzer(data)
            coverage_list = locality_analyzer.compare_circle_item_coverages(0, fr_limit[0], fr_limit[1])
            plotter.plotLineY(sorted(coverage_list), "User", "Fraction of Items Covered with %d-%d friends" % (fr_limit[0], fr_limit[1]))
            print utils.mean_sd(coverage_list)
    elif computation_cmd == "items_edge_coverage":
        locality_analyzer = LocalityAnalyzer(data)
        items_cov_list, items_popularity, cov_ratio_list = locality_analyzer.compare_items_edge_coverage(1, minimum_interactions=1)
        print utils.mean_sd(items_cov_list)
        print utils.mean_sd(items_popularity)
        #plotter.plotHist(sorted([val for val in cov_ratio_list if val<=1]), "Ratio of Edge coverage to total popularity", "Frequency", logyscale=True)
        #####plotter.plotHist(sorted([val for val in cov_ratio_list]), "Ratio of Edge coverage to total popularity", "Frequency", logyscale=True)
        #plotter.plotHist(sorted(items_popularity), "Item", "total popularity")
        plotter.plotCumulativePopularity(items_popularity, labelx="Item percentile", labely="Cum. percent of number of likes")
    elif computation_cmd == "network_draw":
        net_visualizor = NetworkVisualizor(data)
        net_visualizor.draw_network()
    elif computation_cmd == "network_item_adopt":
        net_visualizor = NetworkVisualizor(data)
        pprint(net_visualizor.plot_item_adoption(1669118))
    elif computation_cmd == "node_details":
        for node_id in open('user_ids'):
            if node_id.strip('\n') != "User_id":
                net_analyzer.get_node_details(int(node_id.strip('\n')))
    elif computation_cmd=="store_dataset":
        user_interacts = net_analyzer.get_user_interacts(1, cutoff_rating)
        f = open(outf_path+ 'user_interacts_'+dataset_domain+'.tsv', 'w')
        f.write("user_id\titem_id\ttimestamp\n")
        for user_id, item_id, timestamp in user_interacts:
            f.write("%s\t%s\t%s\n" %(user_id, item_id, timestamp)) 
        f.close()
        
        item_pop = net_analyzer.get_items_popularity(1, cutoff_rating)    
        f = open(outf_path+'items_'+dataset_domain+'.tsv','w')
        f.write("item_id\tpopularity\n")
        for item_id, pop in item_pop.iteritems():
            f.write("%s\t%s\n" %(item_id, pop))
        f.close()

        user_friends = net_analyzer.get_user_friends()
        f = open('user_friends_'+dataset_domain+'.tsv','w')
        f.write("user_id\tfriend_id\n")
        for user_id, friend_id in user_friends:
            f.write("%s\t%s\n" %(user_id, friend_id))
        f.close()
        print "Successfully stored tsv dataset"
    elif computation_cmd=="compare_interact_types":
        num_interacts_dict = net_analyzer.compare_interaction_types()
        interact_types = num_interacts_dict.keys()
        plotter.plotLinesYY(num_interacts_dict[interact_types[0]], 
                            num_interacts_dict[interact_types[1]],
                            interact_types[0], interact_types[1], 
                            display=True, logyscale=True)
         
        plotter.plotLinesYY(num_interacts_dict[interact_types[1]], 
                            num_interacts_dict[interact_types[2]],
                            interact_types[1], interact_types[2], 
                            display=True, logyscale=True)
         
        plotter.plotLinesYY(num_interacts_dict[interact_types[0]], 
                            num_interacts_dict[interact_types[2]],
                            interact_types[0], interact_types[2], 
                            display=True, logyscale=True)
    elif computation_cmd=="influence_test":
        #   ta = TemporalAnalyzer(data)
        #interact_type = data.interact_types_dict["listen"]
        # time_scale can be 'w':wallclock_time or 'o':ordinal_time
        split_date_str = "2008/01/01"
        t_window = -1
        t_scale = ord('w')
        max_tries_val = 10000
        max_node_computes_val = 100
        max_interact_ratio_error = 0.1
        klim_val=5
        split_timestamp = int(time.mktime(datetime.datetime.strptime(split_date_str, "%Y/%m/%d").timetuple()))
        # create training/test sets that will be used by fake generation
        data.create_training_test_bytime(interact_type, split_timestamp)
        if create_fake_prefs is not None:
            print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1)
            fake_data.generate_fake_preferences(data, interact_type, split_timestamp,
                        min_interactions_beforeaftersplit_per_user=min_interacts_beforeaftersplit_per_user,
                        time_window=t_window, time_scale=t_scale, method=create_fake_prefs)
            
            #fake_data.generate_random_preferences(data, interact_type, split_timestamp)
            print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1)
        # Need to generate again because fake data changes test data           
        data.create_training_test_bytime(interact_type, split_timestamp)
        
        la = LocalityAnalyzer(data)
        inf_tuple = compute.test_influence(la, interact_type=interact_type,
                               time_diff=t_window, time_scale=ord('w'), split_timestamp=split_timestamp,
                               #time_diff=100000, split_date_str="1970/06/23",
                               control_divider=0.01,
                               min_interactions_beforeaftersplit_per_user=min_interacts_beforeaftersplit_per_user,
                               max_tries=max_tries_val, max_node_computes=max_node_computes_val,
                               num_threads=4,
                               max_interact_ratio_error=max_interact_ratio_error,
                               klim=klim_val,
                               method="influence")
        print "t-test results", ttest_rel(inf_tuple[2], inf_tuple[3])
        num_vals = len(inf_tuple[0])
        f = open("influence_test", "w")
        for i in range(num_vals):
            f.write("%f\t%f\t%f\t%f\n" % (inf_tuple[0][i], inf_tuple[1][i], 
                        inf_tuple[2][i], inf_tuple[3][i]))
        f.close()
             
    elif computation_cmd=="suscept_test":
        use_artists = "songs" if "songs" in dataset_path else "artists"
        interact_type_str = "listen" if interact_type==0 else "love"
        #M = [50]#,20]#,30,40,50]
        t_scale = ord('o') # ordinal scale; this is the default used in the paper.
        NUM_NODES_TO_COMPUTE = 4000000 # maximum number of nodes to compute
        num_threads = 4 # the number of threads to spawn
        max_tries_val = None # set to e.g. 30000 to stop after max_tries
        max_node_computes_val = NUM_NODES_TO_COMPUTE/num_threads # number of nodes to compute in each thread
        #max_interact_ratio_error = 0.2 # error tolerances (defaults are 0.1, 0.1)
        #max_sim_ratio_error = 0.2
        #min_friends_match_ratio = 0.5 # important to be 1 for simulation because, e.g., in influence, we use all of a person's friends to compute their next like
        klim_val = None # not used for the influence test
        nonfr_match = "random" # random, serial, or kbest; default is random.
        num_loop = 1 # number of times to repeat the calculation, for averaging results over multiple runs
        f = open("suscept_test_results/"+dataset_domain + dataset_path.split("/")[-2] + interact_type_str+ strftime("%Y-%m-%d_%H:%M:%S")+'.dat', 'w')
        f.write("# use_artists=%r\tallow_duplicates=%r\tmax_node_computes_val=%d\tcreate_fake_prefs=%r\tnum_loop=%d\n" % (
                    use_artists, allow_duplicates, max_node_computes_val,
                        create_fake_prefs, num_loop))
        f.write("# split_train_test_date=%s\ttime_scale=%d\tmin_interactions_beforeaftersplit_per_user=%d\tnum_threads=%d\n" % (
                    split_date_str, t_scale, min_interacts_beforeaftersplit_per_user, num_threads))
        f.write("# max_interact_ratio_error=%f\tmax_sim_ratio_error=%f\tmin_friends_match_ratio=%f\n" %(
                    max_interact_ratio_error, max_sim_ratio_error, min_friends_match_ratio
                    ))
        for t_window in M:
            for h in range(num_loop):
                f.write("\n\n################### ALERTINFO: STARTING ITERATION %d  with M=%d\n" %( h, t_window))
                if split_date_str=="test": split_timestamp = 2000
                else:
                    split_timestamp = int(time.mktime(datetime.datetime.strptime(split_date_str, "%Y/%m/%d").timetuple()))
                #split_timestamp=25000000
                if create_fake_prefs is not None:
                    data.create_training_test_bytime(interact_type, split_timestamp)
                    #print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1)
                    fake_data.generate_fake_preferences(data, interact_type, split_timestamp,
                            min_interactions_beforeaftersplit_per_user = min_interacts_beforeaftersplit_per_user,
                            time_window=t_window, time_scale=t_scale, method=create_fake_prefs)
                    #print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1)
                # Need to generate again because fake data changes test data           
                data.create_training_test_bytime(interact_type, split_timestamp, min_interactions_beforeaftersplit_per_user=min_interacts_beforeaftersplit_per_user)
                la = LocalityAnalyzer(data)
                inf_tuple = compute.test_influence(la, interact_type=interact_type, 
                                       time_diff=t_window, time_scale=t_scale, split_timestamp=split_timestamp, 
                                       #time_diff=100000, split_date_str="1970/06/23", 
                                       control_divider=0.01, # not used anymore
                                       min_interactions_beforeaftersplit_per_user = min_interacts_beforeaftersplit_per_user,
                                       max_tries = max_tries_val, max_node_computes=max_node_computes_val, num_threads=num_threads,
                                       max_interact_ratio_error = max_interact_ratio_error,
                                       max_sim_ratio_error = max_sim_ratio_error,
                                       min_friends_match_ratio=min_friends_match_ratio,
                                       klim = klim_val,
                                       nonfr_match=nonfr_match,
                                       method="suscept", 
                                       allow_duplicates=allow_duplicates)
                print "t-test results", ttest_rel(inf_tuple[2], inf_tuple[3])
                num_vals = len(inf_tuple[0])
                f.write("TestSetSize\tFrSimilarity\tNonFrSimilarity\tFrOverlap\tNonFrOverlap\tRandom_run_no\tM\n")
                for i in range(num_vals):
                    f.write("%d\t%f\t%f\t%f\t%f\t%d\t%d\n" % (inf_tuple[0][i], inf_tuple[1][i], 
                                inf_tuple[2][i], inf_tuple[3][i], inf_tuple[4][i], h, t_window))
        f.close()
    elif computation_cmd=="gen_adopt_data":
        t_window = 100 
        t_scale = ord('o')
        if split_date_str=="test": split_timestamp = 2000
        else:
            split_timestamp = int(time.mktime(datetime.datetime.strptime(split_date_str, "%Y/%m/%d").timetuple()))
        if create_fake_prefs is not None:
            data.create_training_test_bytime(interact_type, split_timestamp)
            #print data.get_nodes_list()[1].get_interactions(interact_type, cutoff_rating=-1)
            fake_data.generate_fake_preferences(data, interact_type, split_timestamp,
                    min_interactions_beforeaftersplit_per_user = min_interacts_beforeaftersplit_per_user,
                    time_window=t_window, time_scale=t_scale, method=create_fake_prefs)
        
        data.create_training_test_bytime(interact_type, split_timestamp)
        gen_adopt.generate_adoption_data(data, interact_type, split_timestamp, 
            min_interactions_beforeaftersplit_per_user=min_interacts_beforeaftersplit_per_user, time_window=t_window, 
            time_scale=t_scale)
    elif computation_cmd=="compute_split_date":
        ret_timestamp = compute.compute_cutoff_date(data, interact_type, traindata_fraction)
        print ret_timestamp
        print datetime.datetime.fromtimestamp(ret_timestamp*86400).strftime("%Y-%m-%d")
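# A hypothetical driver showing how run_computation might be invoked for the
# "basic_stats" command. The load_dataset() helper, the output path, and the
# concrete parameter values are assumptions for illustration only; values such
# as the 0.1 error tolerances, min_friends_match_ratio=1, and M=[50] follow
# the comments in the suscept_test branch above.
if __name__ == "__main__":
    data = load_dataset()  # hypothetical loader; not defined in these examples
    outf = open("results.tsv", "w")
    run_computation(data, "basic_stats", outf,
                    interact_type=0, create_fake_prefs=None,
                    allow_duplicates=False, split_date_str="2008/01/01",
                    dataset_domain="lastfm", dataset_path="lastfm/",
                    min_interacts_beforeaftersplit_per_user=1,
                    max_interact_ratio_error=0.1, max_sim_ratio_error=0.1,
                    min_friends_match_ratio=1.0,
                    traindata_fraction=0.7, M=[50])
    outf.close()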