Example #1
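# Benchmark PageRank and HeatKernel filters, plus automatic algorithm selection, on every
# community of a dataset, printing LaTeX-ready tables for AUC, Modularity under the pRule
# fairness constraint, and a custom disparate-mistreatment measure.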
def test_all_communities_benchmarks():
    datasets = ["bigraph"]
    pre = pg.preprocessor(assume_immutability=True, normalization="symmetric")
    tol = 1.E-9
    optimization = pg.SelfClearDict()
    algorithms = {
        "ppr0.85": pg.PageRank(alpha=0.85, preprocessor=pre, max_iters=10000, tol=tol),
        "ppr0.9": pg.PageRank(alpha=0.9, preprocessor=pre, max_iters=10000, tol=tol),
        "ppr0.99": pg.PageRank(alpha=0.99, preprocessor=pre, max_iters=10000, tol=tol),
        "hk3": pg.HeatKernel(t=3, preprocessor=pre, max_iters=10000, tol=tol, optimization_dict=optimization),
        "hk5": pg.HeatKernel(t=5, preprocessor=pre, max_iters=10000, tol=tol, optimization_dict=optimization),
        "hk7": pg.HeatKernel(t=7, preprocessor=pre, max_iters=10000, tol=tol, optimization_dict=optimization),
    }

    tuned = {"selected": pg.AlgorithmSelection(algorithms.values(), fraction_of_training=0.8)}
    loader = pg.load_datasets_all_communities(datasets, min_group_size=50)
    pg.benchmark_print(pg.benchmark(algorithms | tuned, loader, pg.AUC, fraction_of_training=.8, seed=list(range(1))),
                       decimals=3, delimiter=" & ", end_line="\\\\")
    loader = pg.load_datasets_all_communities(datasets, min_group_size=50)
    pg.benchmark_print(pg.benchmark(algorithms | tuned, loader, pg.Modularity, sensitive=pg.pRule, fraction_of_training=.8, seed=list(range(1))),
                       decimals=3, delimiter=" & ", end_line="\\\\")
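    # Disparate mistreatment: the average (pg.AM) of the TPR and TNR disparities between
    # the sensitive group and its complement, evaluated only on non-training nodes.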
    mistreatment = lambda known_scores, sensitive_signal, exclude: \
        pg.AM([pg.Disparity([pg.TPR(known_scores, exclude=1 - (1 - exclude.np) * sensitive_signal.np),
                             pg.TPR(known_scores, exclude=1 - (1 - exclude.np) * (1 - sensitive_signal.np))]),
               pg.Disparity([pg.TNR(known_scores, exclude=1 - (1 - exclude.np) * sensitive_signal.np),
                             pg.TNR(known_scores, exclude=1 - (1 - exclude.np) * (1 - sensitive_signal.np))])])
    loader = pg.load_datasets_all_communities(datasets, min_group_size=50)
    pg.benchmark_print(pg.benchmark(algorithms | tuned, loader, pg.Modularity, sensitive=mistreatment, fraction_of_training=.8, seed=list(range(1))),
                       decimals=3, delimiter=" & ", end_line="\\\\")
Example #2
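# Check that unsupervised quality measures correlate with supervised ones (AUC, NDCG)
# across many algorithm variations; LinkAUCdot should correlate best with AUC among the
# unsupervised measures.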
def test_unsupervised_vs_auc():
    def loader():
        return pg.load_datasets_multiple_communities(["graph9"])

    algorithms = pg.create_variations(pg.create_many_filters(), pg.create_many_variation_types())
    time_scores = pg.benchmark_scores(pg.benchmark(algorithms, loader(), pg.Time))
    assert sum(time_scores) > 0

    measures = {"AUC": lambda ground_truth, exlude: pg.MultiSupervised(pg.AUC, ground_truth, exlude),
                "NDCG": lambda ground_truth, exlude: pg.MultiSupervised(pg.NDCG, ground_truth, exlude),
                "Density": lambda graph: pg.MultiUnsupervised(pg.Density, graph),
                "Conductance": lambda graph: pg.MultiUnsupervised(pg.Conductance(autofix=True).as_unsupervised_method(), graph),
                "Modularity": lambda graph: pg.MultiUnsupervised(pg.Modularity(max_positive_samples=5).as_unsupervised_method(), graph),
                "CCcos": lambda graph: pg.ClusteringCoefficient(graph, similarity="cos", max_positive_samples=5),
                "CCdot": lambda graph: pg.ClusteringCoefficient(graph, similarity="dot", max_positive_samples=5),
                "LinkAUCcos": lambda graph: pg.LinkAssessment(graph, similarity="cos", max_positive_samples=5),
                "LinkAUCdot": lambda graph: pg.LinkAssessment(graph, similarity="dot", max_positive_samples=5),
                "HopAUCcos": lambda graph: pg.LinkAssessment(graph, similarity="cos", hops=2, max_positive_samples=5),
                "HopAUCdot": lambda graph: pg.LinkAssessment(graph, similarity="dot", hops=2, max_positive_samples=5),
                }

    scores = {}
    for measure in measures:  # fill with a plain loop; pytest slows down noticeably with the equivalent dict comprehension
        scores[measure] = pg.benchmark_scores(pg.benchmark(algorithms, loader(), measures[measure]))
    supervised = {"AUC", "NDCG"}
    evaluations = dict()
    for measure in measures:
        evaluations[measure] = abs(pg.SpearmanCorrelation(scores["AUC"])(scores[measure]))
    #for measure in measures:
    #    print(measure, evaluations[measure])
    assert max([evaluations[measure] for measure in measures if measure not in supervised]) == evaluations["LinkAUCdot"]
Example #3
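# Benchmark the same filters on datasets with multiple communities, evaluating
# multi-community AUC and Modularity under the pRule fairness constraint.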
def test_multigroup_benchmarks():
    datasets = ["bigraph"]
    pre = pg.preprocessor(assume_immutability=True, normalization="symmetric")
    tol = 1.E-9
    optimization = pg.SelfClearDict()
    algorithms = {
        "ppr0.85": pg.PageRank(alpha=0.85, preprocessor=pre, max_iters=10000, tol=tol),
        "ppr0.9": pg.PageRank(alpha=0.9, preprocessor=pre, max_iters=10000, tol=tol),
        "ppr0.99": pg.PageRank(alpha=0.99, preprocessor=pre, max_iters=10000, tol=tol),
        "hk3": pg.HeatKernel(t=3, preprocessor=pre, max_iters=10000, tol=tol, optimization_dict=optimization),
        "hk5": pg.HeatKernel(t=5, preprocessor=pre, max_iters=10000, tol=tol, optimization_dict=optimization),
        "hk7": pg.HeatKernel(t=7, preprocessor=pre, max_iters=10000, tol=tol, optimization_dict=optimization),
    }

    tuned = {"selected": pg.AlgorithmSelection(algorithms.values(), fraction_of_training=0.8)}
    loader = pg.load_datasets_multiple_communities(datasets, min_group_size=50)
    pg.benchmark_print(
        pg.benchmark(algorithms | tuned, loader,
                     lambda ground_truth, exclude: pg.MultiSupervised(pg.AUC, ground_truth, exclude),
                     fraction_of_training=.8, seed=list(range(1))),
        decimals=3, delimiter=" & ", end_line="\\\\")
    loader = pg.load_datasets_multiple_communities(datasets, min_group_size=50)
    pg.benchmark_print(
        pg.benchmark(algorithms | tuned, loader, pg.Modularity, sensitive=pg.pRule,
                     fraction_of_training=.8, seed=list(range(1))),
        decimals=3, delimiter=" & ", end_line="\\\\")
Example #4
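# Exercise benchmark output helpers: fraction-to-string formatting, printing to a text
# stream, and conversion of results to nested dicts (assumes io and pygrank as pg are
# imported at module level).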
def test_benchmark_print():
    assert pg.benchmarks.utils._fraction2str(0.1) == ".10"
    assert pg.benchmarks.utils._fraction2str(0.00001) == "0"
    assert pg.benchmarks.utils._fraction2str(1) == "1.00"
    loader = pg.load_datasets_one_community(["graph9", "bigraph"])
    console = pg.benchmark_print(pg.benchmark(pg.create_demo_filters(), loader),
                                 out=io.StringIO(""), err=None).getvalue()
    loader = pg.load_datasets_one_community(["graph9", "bigraph"])
    ret = pg.benchmark_dict(pg.benchmark(pg.create_demo_filters(), loader, sensitive=pg.MannWhitneyParity))
    assert isinstance(ret, dict)
    assert len(ret) == 3
    assert isinstance(ret["graph9"], dict)
    assert (len(str(ret)) - len(console)) < (len(str(ret)) + len(console))/2
Example #5
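# Rank algorithms per dataset by AUC, average the ranks across datasets, and print the
# summary; runs on the numpy backend.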
def test_one_community_benchmarks():
    pg.load_backend("numpy")
    datasets = ["graph9", "bigraph"]
    pre = pg.preprocessor(assume_immutability=True, normalization="symmetric")
    algorithms = {
        "ppr0.85": pg.PageRank(alpha=0.85, preprocessor=pre, max_iters=10000, tol=1.E-9),
        "ppr0.99": pg.PageRank(alpha=0.99, preprocessor=pre, max_iters=10000, tol=1.E-9),
        "hk3": pg.HeatKernel(t=3, preprocessor=pre, max_iters=10000, tol=1.E-9),
        "hk5": pg.HeatKernel(t=5, preprocessor=pre, max_iters=10000, tol=1.E-9),
        "tuned": pg.ParameterTuner(preprocessor=pre, max_iters=10000, tol=1.E-9),
    }
    # algorithms = benchmark.create_variations(algorithms, {"": pg.Tautology, "+SO": pg.SeedOversampling})
    # loader = pg.load_datasets_one_community(datasets)
    # pg.benchmark(algorithms, loader, "time", verbose=True)

    loader = pg.load_datasets_one_community(datasets)
    pg.benchmark_print(
        pg.benchmark_average(
            pg.benchmark_ranks(
                pg.benchmark(algorithms, loader, pg.AUC, fraction_of_training=.8))))
Example #6
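# (Truncated snippet.) The lines below close an algorithms dict whose elided entry is
# plausibly a fairness-aware postprocessor such as pg.FairPersonalizer(filter, .8, ...),
# judging by the commented-out "FFfix-C" alternative; datasets, filter and seed_fractions
# are assumed to be defined earlier in the original file.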
                            .8,
                            pRule_weight=10,
                            max_residual=1,
                            error_type=pg.Mabs,
                            error_skewing=False,
                            parameter_buckets=1,
                            parity_type="impact")
        #"FFfix-C": pg.FairTradeoff(filter, .8, pRule_weight=10, error_type=pg.Mabs)
        #"FairTf": pg.FairnessTf(filter)
    }
    algorithms = pg.create_variations(algorithms, {"": pg.Normalize})

    #import cProfile as profile
    #pr = profile.Profile()
    #pr.enable()
    # Same disparate-mistreatment measure as in Example #1: the average (pg.AM) of the TPR
    # and TNR disparities between the sensitive group and its complement.
    mistreatment = lambda known_scores, sensitive_signal, exclude: \
        pg.AM([pg.Disparity([pg.TPR(known_scores, exclude=1 - (1 - exclude.np) * sensitive_signal.np),
                             pg.TPR(known_scores, exclude=1 - (1 - exclude.np) * (1 - sensitive_signal.np))]),
               pg.Disparity([pg.TNR(known_scores, exclude=1 - (1 - exclude.np) * sensitive_signal.np),
                             pg.TNR(known_scores, exclude=1 - (1 - exclude.np) * (1 - sensitive_signal.np))])])
    pg.benchmark_print(pg.benchmark(algorithms,
                                    pg.load_datasets_multiple_communities(
                                        datasets, max_group_number=2),
                                    metric=pg.AUC,
                                    sensitive=pg.pRule,
                                    fraction_of_training=seed_fractions),
                       delimiter=" & ",
                       end_line="\\\\")

    #pr.disable()
    #pr.dump_stats('profile.pstat')
Example #7
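# (Truncated snippet.) The class and method header below are an assumed reconstruction,
# inferred from the StochasticSeedOversampling() usage near the end; only the method body
# is original. The idea: rerank from every sufficiently high-scoring node and accumulate
# the score-weighted results.
class StochasticSeedOversampling(pg.Postprocessor):
    def __init__(self, ranker=None):
        super().__init__(ranker)

    def rank(self,
             graph: pg.GraphSignalData = None,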
             personalization: pg.GraphSignalData = None,
             **kwargs):
        personalization = pg.to_signal(graph, personalization)
        graph = personalization.graph
        ranks = self.ranker(personalization)
        ret = 0
        total_sum = pg.sum(ranks)
        accum_sum = 0
        # find the smallest score threshold such that scores below it hold ~10% of the total mass
        for threshold in sorted(ranks.values()):
            accum_sum += threshold
            if accum_sum > total_sum * 0.1:
                break
        for i, v in enumerate(ranks):
            pg.utils.log(f"{i}/{len(ranks)}")
            if ranks[v] >= threshold:
                partial = ranks >> pg.Threshold(ranks[v],
                                                inclusive=True) >> self.ranker
                ret = partial * ranks[v] + ret
        return ret


algs = {
    "ppr": pg.PageRank(0.9),
    "ppr+so": pg.PageRank(0.9) >> pg.SeedOversampling(),
    "ppr+bso": pg.PageRank(0.9) >> pg.BoostedSeedOversampling(),
    "ppr+sso": pg.PageRank(0.9) >> StochasticSeedOversampling(),
}

loader = pg.load_datasets_one_community(["citeseer"])
pg.benchmark_print(pg.benchmark(algs, loader, pg.AUC, 3))
Example #8
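# Compare filters and their +Sweep variations against tuned and auto-selected algorithms,
# print each algorithm's recommended citations, and output a LaTeX-ready AUC table.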
import pygrank as pg
datasets = ["EUCore", "Amazon"]
pre = pg.preprocessor(assume_immutability=True, normalization="symmetric")
algs = {
    "ppr.85": pg.PageRank(.85, preprocessor=pre, tol=1.E-9, max_iters=1000),
    "ppr.99": pg.PageRank(.99, preprocessor=pre, tol=1.E-9, max_iters=1000),
    "hk3": pg.HeatKernel(3, preprocessor=pre, tol=1.E-9, max_iters=1000),
    "hk5": pg.HeatKernel(5, preprocessor=pre, tol=1.E-9, max_iters=1000),
}

algs = algs | pg.create_variations(algs, {"+Sweep": pg.Sweep})
loader = pg.load_datasets_one_community(datasets)
algs["tuned"] = pg.ParameterTuner(preprocessor=pre, tol=1.E-9, max_iters=1000)
algs["selected"] = pg.AlgorithmSelection(
    pg.create_demo_filters(preprocessor=pre, tol=1.E-9,
                           max_iters=1000).values())
algs["tuned+Sweep"] = pg.ParameterTuner(
    ranker_generator=lambda params: pg.Sweep(
        pg.GenericGraphFilter(
            params, preprocessor=pre, tol=1.E-9, max_iters=1000)))

for alg in algs.values():
    print(alg.cite())  # prints a list of algorithm citations

pg.benchmark_print(pg.benchmark(algs, loader, pg.AUC, fraction_of_training=.5),
                   delimiter=" & ",
                   end_line="\\\\")
Example #9
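# Measure how well unsupervised quality measures track supervised AUC and NDCG, using the
# absolute Spearman correlation of benchmark scores over many algorithm variations.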
import pygrank as pg

loader = list(pg.load_datasets_multiple_communities(["bigraph", "cora", "citeseer"]))
algorithms = pg.create_variations(pg.create_demo_filters(), pg.create_many_variation_types())
algorithms = pg.create_variations(algorithms, pg.Normalize)  # add normalization to all algorithms
print("Algorithms", len(algorithms))

measures = {"AUC": lambda ground_truth, exlude: pg.MultiSupervised(pg.AUC, ground_truth, exlude),
            "NDCG": lambda ground_truth, exlude: pg.MultiSupervised(pg.NDCG, ground_truth, exlude),
            "Density": lambda graph: pg.MultiUnsupervised(pg.Density, graph),
            "Modularity": lambda graph: pg.MultiUnsupervised(pg.Modularity, graph),
            "LinkCC": lambda graph: pg.ClusteringCoefficient(graph, similarity="dot"),
            "LinkAUCcos": lambda graph: pg.LinkAssessment(graph, similarity="cos"),
            "HopAUCdot": lambda graph: pg.LinkAssessment(graph, similarity="dot", hops=2),
            }

scores = {measure: pg.benchmark_scores(pg.benchmark(algorithms, loader, measures[measure])) for measure in measures}
evaluations_vs_auc = dict()
evaluations_vs_ndcg = dict()
for measure in measures:
    evaluations_vs_auc[measure] = abs(pg.SpearmanCorrelation(scores["AUC"])(scores[measure]))
    evaluations_vs_ndcg[measure] = abs(pg.SpearmanCorrelation(scores["NDCG"])(scores[measure]))

pg.benchmark_print([("Measure", "AUC corr", "NDCG corr")]
                   + [(measure, evaluations_vs_auc[measure], evaluations_vs_ndcg[measure]) for measure in measures])
Example #10
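# (Truncated snippet.) The lines below are the tail of a helper, plausibly the
# create_param_tuner used further down, that builds a pg.ParameterTuner over normalized
# 40-parameter GenericGraphFilter instances; datasets, algorithms, measure, pre,
# optimization and community_size are assumed to be defined earlier in the original file.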
                              pg.Normalize(
                                  postprocessor(
                                      pg.GenericGraphFilter([1]+params,
                                                            preprocessor=pre,
                                                            error_type="iters",
                                                            max_iters=41,
                                                            optimization_dict=optimization,
                                                            preserve_norm=False))),
                             deviation_tol=1.E-6,
                             measure=measure,
                             optimizer=optimizer,
                             max_vals=[1]*40,
                             min_vals=[0]*40)


tuned = {
    "select": pg.AlgorithmSelection(algorithms.values(), fraction_of_training=0.9, measure=measure),
    "tune": create_param_tuner(),
    "tuneLBFGSB": create_param_tuner(pg.lbfgsb)
}

for name, graph, group in pg.load_datasets_all_communities(datasets, min_group_size=community_size, max_group_number=3):
    print(" & ".join([str(val) for val in [name, len(graph), graph.number_of_edges(), len(group)]])+" \\\\")

loader = pg.load_datasets_all_communities(datasets, min_group_size=community_size, max_group_number=3)
pg.benchmark_print(
    pg.benchmark_average(pg.benchmark(algorithms | tuned, loader, measure,
                                      fraction_of_training=[0.1, 0.2, 0.3], seed=list(range(1))),
                         posthocs=True),
    decimals=3, delimiter=" & ", end_line="\\\\")

Example #11
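# (Snippet.) Compares auto-selected algorithms against an Arnoldi-basis HopTuner on
# all-communities AUC; assumes algorithms, pre, tol, optimization and datasets are
# defined as in the earlier examples.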
tuned = {
    "selected": pg.AlgorithmSelection(algorithms.values(), fraction_of_training=0.8),
    #"tuned": pg.ParameterTuner(preprocessor=pre, fraction_of_training=0.8, tol=tol, optimization_dict=optimization, measure=pg.AUC),
    "arnoldi": pg.HopTuner(preprocessor=pre, basis="arnoldi", measure=pg.Cos, tol=tol, optimization_dict=optimization),
    #"arnoldi2": pg.ParameterTuner(lambda params: pg.HopTuner(preprocessor=pre, basis="arnoldi", num_parameters=int(params[0]),
    #                                                         measure=pg.Cos,
    #                                                         tol=tol, optimization_dict=optimization, tunable_offset=None),
    #                              max_vals=[40], min_vals=[5], divide_range=2, fraction_of_training=0.1),
}

#algorithms = pg.create_variations(algorithms, {"": pg.Tautology, "+Sweep": pg.Sweep})
#print(algorithms.keys())

#for name, graph, group in pg.load_datasets_all_communities(datasets, min_group_size=50):
#    print(" & ".join([str(val) for val in [name, len(graph), graph.number_of_edges(), len(group)]])+" \\\\")
loader = pg.load_datasets_all_communities(datasets, min_group_size=50)
pg.benchmark_print(pg.benchmark(algorithms | tuned,
                                loader,
                                pg.AUC,
                                fraction_of_training=.8,
                                seed=list(range(1))),
                   decimals=3,
                   delimiter=" & ",
                   end_line="\\\\")