def test_rank_results(self):
    from pygrank.algorithms.pagerank import PageRank as Ranker
    from pygrank.algorithms.utils import preprocessor
    G = create_test_graph()
    test_result = Ranker(to_scipy=preprocessor('col')).rank(G)
    nx_result = nx.pagerank_scipy(G)
    # average absolute difference between the two rank vectors
    abs_diffs = sum(abs(test_result[v] - nx_result[v]) for v in nx_result.keys()) / len(nx_result)
    self.assertAlmostEqual(abs_diffs, 0, places=16, msg="PageRank compliance with nx results")
def test_immutability_speedup(self):
    from pygrank.algorithms.pagerank import PageRank as Ranker
    from pygrank.algorithms.utils import preprocessor
    repeats = 50
    G = create_test_graph()
    # time repeated runs without hashing of the normalized adjacency matrix
    ranker = Ranker(to_scipy=preprocessor('col'))
    tic = time.perf_counter()  # time.clock() was removed in Python 3.8
    for _ in range(repeats):
        ranker.rank(G)
    unhashed_time = time.perf_counter() - tic
    # time repeated runs with hashing enabled
    ranker = Ranker(to_scipy=preprocessor('col', assume_immutability=True))
    tic = time.perf_counter()
    for _ in range(repeats):
        ranker.rank(G)
    hashed_time = time.perf_counter() - tic
    self.assertLessEqual(hashed_time, unhashed_time, msg="Hashing speedup")
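# Context for the assertion above: with assume_immutability=True the preprocessor
# hashes the graph and reuses the normalized adjacency matrix across runs, so only
# the first rank() call pays for normalization. A minimal sketch of that behavior
# (reuse of the cached matrix is an assumption about the caching strategy, not a
# documented guarantee):
#
#     pre = preprocessor('col', assume_immutability=True)
#     M1 = pre(G)  # normalization computed, result cached against G's hash
#     M2 = pre(G)  # cache hit: the stored matrix is reused without recomputation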
def test_rank_time(self):
    from pygrank.algorithms.pagerank import PageRank as Ranker
    from pygrank.algorithms.utils import preprocessor
    import scipy.stats
    nx_time = list()
    test_time = list()
    repeats = 50
    for _ in range(repeats):
        G = create_test_graph()
        tic = time.perf_counter()
        Ranker(to_scipy=preprocessor('col')).rank(G)
        test_time.append(time.perf_counter() - tic)
        tic = time.perf_counter()
        nx.pagerank_scipy(G)
        nx_time.append(time.perf_counter() - tic)
    # two-sample t-test on the timing distributions; a p-value below 0.001
    # indicates the two running-time distributions differ significantly
    self.assertLessEqual(scipy.stats.ttest_ind(nx_time, test_time)[1], 0.001,
                         msg="PageRank running time significantly different from nx (p<0.001)")
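# The tests above rely on a create_test_graph helper defined elsewhere in the
# test module. A minimal sketch of such a helper, assuming a small directed
# graph suffices (the edge list below is illustrative, not the actual fixture):
def create_test_graph():
    G = nx.DiGraph()
    G.add_edges_from([('A', 'B'), ('B', 'C'), ('C', 'A'),
                      ('C', 'D'), ('D', 'B'), ('D', 'A')])
    return G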
from scipy.stats import rankdata
ranks = rankdata(ranks)
ground_truth = rankdata(ground_truth)
plt.scatter(ground_truth, ranks)
plt.grid()
plt.show()

# CHANGE THE FOLLOWING BLOCK TO SELECT DATASET
specific_ids = [1723]  # community ids
dataset = 'snap_amazon'  # amazon is used here; dblp is the other option
dataset_name = dataset
G, groups = import_SNAP_data(dataset, specific_ids=specific_ids)
# a preprocessor that hashes the outcome of normalization for faster running time of the same algorithms
pre = preprocessor('col', assume_immutability=True)
pre(G)  # run the preprocessor once so that hashing does not affect potential time measurements

# accumulate LaTeX-style table rows of spearman correlations and iteration counts
result_spearmans = ""
result_iterations = ""
for group_number in range(len(groups)):
    for alpha in [0.85, 0.90, 0.95, 0.99, 0.995, 0.999]:
        result_spearmans += dataset_name + "-" + str(specific_ids[group_number]) + " & & " + str(alpha)[1:]
        result_iterations += dataset_name + "-" + str(specific_ids[group_number]) + " & & " + str(alpha)[1:]
        seeds = {v: 1 for v in groups[group_number]}
        ground_truth_ranker = PageRank(alpha=alpha, to_scipy=pre, tol=1.E-20, max_iters=30000, use_quotient=False)
        ground_truth_ranks = ground_truth_ranker.rank(G, seeds)
        result_iterations += " & " + str(ground_truth_ranker.convergence.iteration)
        print("Found ground truth ranks (" + str(ground_truth_ranker.convergence.iteration) + " iterations)")
        compared_rankers = list()
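        # compared_rankers is presumably filled next with the algorithms whose
        # ranks get correlated against ground_truth_ranks to extend
        # result_spearmans. A hedged sketch of one such correlation with scipy
        # (the ranks variable and the rounding are assumptions for illustration):
        #
        #     from scipy.stats import spearmanr
        #     nodes = list(G.nodes())
        #     correlation, _ = spearmanr([ground_truth_ranks[v] for v in nodes],
        #                                [ranks[v] for v in nodes])
        #     result_spearmans += " & " + str(round(correlation, 3))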
max_iters = 10000
for dataset_name in datasets:
    G, groups = import_SNAP_data(dataset_name, min_group_size=5000)  # 12000 for dblp, 5000 for amazon
    group_sets = [set(group) for group in groups.values()]
    for group in group_sets:
        print(len(group))
    # homophily: the fraction of edges whose two endpoints share at least one group
    count = sum(1 for u, v in G.edges()
                if sum(1 for group in group_sets if u in group and v in group) > 0)
    print('Homophily', count / float(G.number_of_edges()))
    seeds = [0.001, 0.01, 0.1, 0.25, 0.5]
    print('Number of groups', len(groups))
    for seed in seeds:
        pre = preprocessor('col', assume_immutability=True)
        preL = preprocessor('symmetric', assume_immutability=True)
        pre(G)
        tol = 1.E-6
        base_algorithms = {
            "PPRL 0.85": pygrank.algorithms.pagerank.PageRank(alpha=0.85, to_scipy=preL, max_iters=max_iters, tol=tol),
            "PPRL 0.90": pygrank.algorithms.pagerank.PageRank(alpha=0.9, to_scipy=preL, max_iters=max_iters, tol=tol),
            "PPRL 0.95": pygrank.algorithms.pagerank.PageRank(alpha=0.95, to_scipy=preL, max_iters=max_iters, tol=tol),
        }