Example No. 1
def test_auc_ndcg_compliance():
    _, graph, group = next(pg.load_datasets_one_community(["bigraph"]))
    training, test = pg.split(group, 0.5)
    for _ in supported_backends():
        scores1 = pg.PageRank()(graph, training)
        scores2 = pg.HeatKernel()(graph, training)
        AUC1 = pg.AUC(test, exclude=training)(scores1)
        AUC2 = pg.AUC(test, exclude=training)(scores2)
        NDCG1 = float(pg.NDCG(test, exclude=training)(scores1))
        NDCG2 = float(pg.NDCG(test, exclude=training)(scores2))
        assert (AUC1 < AUC2) == (NDCG1 < NDCG2)  # both measures should agree on which algorithm is better
        with pytest.raises(Exception):
            pg.AUC(test, exclude=test, k=len(graph) + 1)(scores2)
        with pytest.raises(Exception):
            pg.NDCG(test, exclude=training, k=len(graph) + 1)(scores2)
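The excerpts on this page come from pygrank's test suite and demos, so they omit their imports and the suite's backend-cycling helper. A minimal, hypothetical preamble that lets the test snippets run standalone could look as follows; supported_backends here is a stand-in that only loads the numpy backend, and the later excerpts that omit their imports likewise assume import pygrank as pg.

import pytest
import pygrank as pg


def supported_backends():
    # stand-in for the test suite's helper: load each backend to test against;
    # only "numpy" is assumed to be installed here
    for backend in ["numpy"]:
        pg.load_backend(backend)
        yield backend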
Example No. 2
def test_krylov_space_oversampling():
    # demonstrates a more involved setup: a Krylov-space approximation combined with seed oversampling
    _, graph, community = next(pg.load_datasets_one_community(["bigraph"]))
    algorithm = pg.HeatKernel(
        t=5,  # the number of hops away HeatKernel places maximal importance on
        krylov_dims=5,
        normalization="symmetric",
        renormalize=True)
    for _ in supported_backends():
        personalization = {node: 1. for node in list(community)[:10]}
        oversampling = pg.SeedOversampling(algorithm)
        pg.Normalize(oversampling)(graph, personalization)
        measure = pg.Conductance()
        assert measure(pg.Normalize(algorithm)(
            graph, personalization)) >= measure(
                pg.Normalize(oversampling)(graph, personalization)) - 5.E-6
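The Krylov-space run above can be contrasted with the exact filter by simply dropping krylov_dims. A small standalone sketch of that comparison, reusing only parameters that already appear in the example:

# sketch: exact vs Krylov-approximated HeatKernel on the same seeds (parameters as in the test above)
_, graph, community = next(pg.load_datasets_one_community(["bigraph"]))
personalization = {node: 1. for node in list(community)[:10]}
exact = pg.HeatKernel(t=5, normalization="symmetric", renormalize=True)
approximate = pg.HeatKernel(t=5, krylov_dims=5, normalization="symmetric", renormalize=True)
print("exact      ", pg.Conductance()(pg.Normalize(exact)(graph, personalization)))
print("approximate", pg.Conductance()(pg.Normalize(approximate)(graph, personalization)))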
Example No. 3
def test_one_community_benchmarks():
    pg.load_backend("numpy")
    datasets = ["graph9", "bigraph"]
    pre = pg.preprocessor(assume_immutability=True, normalization="symmetric")
    algorithms = {
        "ppr0.85": pg.PageRank(alpha=0.85, preprocessor=pre, max_iters=10000, tol=1.E-9),
        "ppr0.99": pg.PageRank(alpha=0.99, preprocessor=pre, max_iters=10000, tol=1.E-9),
        "hk3": pg.HeatKernel(t=3, preprocessor=pre, max_iters=10000, tol=1.E-9),
        "hk5": pg.HeatKernel(t=5, preprocessor=pre, max_iters=10000, tol=1.E-9),
        "tuned": pg.ParameterTuner(preprocessor=pre, max_iters=10000, tol=1.E-9),
    }
    # algorithms = benchmark.create_variations(algorithms, {"": pg.Tautology, "+SO": pg.SeedOversampling})
    # loader = pg.load_datasets_one_community(datasets)
    # pg.benchmark(algorithms, loader, "time", verbose=True)

    loader = pg.load_datasets_one_community(datasets)
    pg.benchmark_print(pg.benchmark(algorithms, loader, pg.AUC, fraction_of_training=.8))
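The commented-out lines in the test above hint at also benchmarking seed-oversampled variations. A hypothetical standalone sketch of that idea with the pg.create_variations helper used by later examples; pg.Tautology as the no-op wrapper is taken from the comment and assumed to be available:

# hypothetical sketch: benchmark plain and seed-oversampled variations of two filters
pre = pg.preprocessor(assume_immutability=True, normalization="symmetric")
base = {
    "ppr0.85": pg.PageRank(alpha=0.85, preprocessor=pre, max_iters=10000, tol=1.E-9),
    "hk3": pg.HeatKernel(t=3, preprocessor=pre, max_iters=10000, tol=1.E-9),
}
variations = pg.create_variations(base, {"": pg.Tautology, "+SO": pg.SeedOversampling})
loader = pg.load_datasets_one_community(["graph9", "bigraph"])
pg.benchmark_print(pg.benchmark(variations, loader, pg.AUC, fraction_of_training=.8))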
Example No. 4
def test_seed_top():
    _, graph, group = next(pg.load_datasets_one_community(["bigraph"]))
    for _ in supported_backends():
        training, evaluation = pg.split(list(group), training_samples=2)
        original_training = set(training)
        from random import random, seed
        seed(0)
        training, evaluation = pg.to_signal(graph, {v: 1 for v in graph if v in original_training or random() < 0.5}), \
                               pg.to_signal(graph, {v: 1 for v in evaluation})
        for measure in [pg.AUC, pg.NDCG]:
            #ranks = pg.PageRank(0.9, max_iters=1000).rank(graph, training)
            #base_result = measure(evaluation, list(original_training)).evaluate(ranks)
            ranks = pg.Top(pg.Sweep(pg.PageRank(0.9, max_iters=1000)), 0.9).rank(graph, training)
            undersampled_result1 = measure(evaluation, list(original_training)).evaluate(ranks)
            ranks = pg.Top(2, pg.Sweep(pg.PageRank(0.9, max_iters=1000))).rank(graph, training)
            undersampled_result2 = measure(evaluation, list(original_training)).evaluate(ranks)
Example No. 5
def test_sweep():
    _, graph, group = next(pg.load_datasets_one_community(["bigraph"]))
    for _ in supported_backends():
        training, evaluation = pg.split(list(group), training_samples=0.1)
        auc1 = pg.AUC({v: 1 for v in evaluation}, exclude=training).evaluate(
            pg.Sweep(pg.PageRank()).rank(graph, {v: 1 for v in training}))
        auc2 = pg.AUC({v: 1 for v in evaluation}, exclude=training).evaluate(
            pg.PageRank().rank(graph, {v: 1 for v in training}))
        auc3 = pg.AUC({v: 1 for v in evaluation}, exclude=training).evaluate(
            pg.LinearSweep(pg.Transformer(pg.PageRank(), pg.log)).rank(
                graph, {v: 1 for v in training}))
        assert auc1 > auc2  # sweeping should improve AUC on this dataset
        assert auc1 == auc3  # log is monotone, so LinearSweep over log-scores is rank-equivalent to Sweep
Example No. 6
def test_threshold():
    _, graph, group = next(pg.load_datasets_one_community(["bigraph"]))
    for _ in supported_backends():
        training, evaluation = pg.split(list(group), training_samples=0.5)
        algorithm = pg.PageRank()
        cond1 = pg.Conductance().evaluate(
            pg.Threshold(pg.Sweep(algorithm), "gap").rank(graph, {v: 1 for v in training}))
        cond2 = pg.Conductance().evaluate(
            pg.Threshold(0.3).transform(
                algorithm.rank(graph, {v: 1 for v in training})))  # try all api types
        cond3 = pg.Conductance().evaluate(
            pg.Threshold(1).transform(
                algorithm.rank(graph, {v: 1 for v in training})))  # should yield infinite conductance
        # TODO: find an algorithm other than gap to outperform 0.2 threshold too
        assert cond1 <= cond2
        assert cond2 <= cond3
Example No. 7
class StochasticSeedOversampling(pg.Postprocessor):
    # Reconstructed header: the original excerpt begins mid-signature. The class name is taken from
    # its use further below; subclassing pg.Postprocessor and the constructor are assumptions.
    def __init__(self, ranker=None):
        super().__init__(ranker)

    def rank(self,
             graph=None,
             personalization: pg.GraphSignalData = None,
             **kwargs):
        personalization = pg.to_signal(graph, personalization)
        graph = personalization.graph
        ranks = self.ranker(personalization)
        ret = 0
        total_sum = pg.sum(ranks)
        accum_sum = 0
        # pick the score threshold below which only ~10% of the total score mass lies
        for threshold in sorted(ranks.values()):
            accum_sum += threshold
            if accum_sum > total_sum * 0.1:
                break
        for i, v in enumerate(ranks):
            pg.utils.log(f"{i}/{len(ranks)}")
            if ranks[v] >= threshold:
                # re-run the wrapped ranker with the scores thresholded at ranks[v] as new seeds
                partial = ranks >> pg.Threshold(ranks[v], inclusive=True) >> self.ranker
                ret = partial * ranks[v] + ret
        return ret


algs = {
    "ppr": pg.PageRank(0.9),
    "ppr+so": pg.PageRank(0.9) >> pg.SeedOversampling(),
    "ppr+bso": pg.PageRank(0.9) >> pg.BoostedSeedOversampling(),
    "ppr+sso": pg.PageRank(0.9) >> StochasticSeedOversampling(),
}

loader = pg.load_datasets_one_community(["citeseer"])
pg.benchmark_print(pg.benchmark(algs, loader, pg.AUC, 3))
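Besides the aggregate benchmark above, the same algs dictionary can be scored on a single train/test split with the supervised measures from the earlier examples; the dataset and split ratio below are arbitrary choices for illustration:

# sketch: evaluate the algorithms above on one split instead of the repeated benchmark
_, graph, group = next(pg.load_datasets_one_community(["citeseer"]))
train, test = pg.split(group, 0.5)
for name, alg in algs.items():
    print(name, pg.AUC(test, exclude=train)(alg(graph, train)))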
Example No. 8
import pygrank as pg
datasets = ["EUCore", "Amazon"]
pre = pg.preprocessor(assume_immutability=True, normalization="symmetric")
algs = {
    "ppr.85": pg.PageRank(.85, preprocessor=pre, tol=1.E-9, max_iters=1000),
    "ppr.99": pg.PageRank(.99, preprocessor=pre, tol=1.E-9, max_iters=1000),
    "hk3": pg.HeatKernel(3, preprocessor=pre, tol=1.E-9, max_iters=1000),
    "hk5": pg.HeatKernel(5, preprocessor=pre, tol=1.E-9, max_iters=1000),
}

algs = algs | pg.create_variations(algs, {"+Sweep": pg.Sweep})
loader = pg.load_datasets_one_community(datasets)
algs["tuned"] = pg.ParameterTuner(preprocessor=pre, tol=1.E-9, max_iters=1000)
algs["selected"] = pg.AlgorithmSelection(
    pg.create_demo_filters(preprocessor=pre, tol=1.E-9,
                           max_iters=1000).values())
algs["tuned+Sweep"] = pg.ParameterTuner(
    ranker_generator=lambda params: pg.Sweep(
        pg.GenericGraphFilter(
            params, preprocessor=pre, tol=1.E-9, max_iters=1000)))

for alg in algs.values():
    print(alg.cite())  # prints a list of algorithm citations

pg.benchmark_print(pg.benchmark(algs, loader, pg.AUC, fraction_of_training=.5),
                   delimiter=" & ",
                   end_line="\\\\")
Example No. 9
def overlapping_community_detection(graph, known_members, top=None):
    # Reconstructed opening: the original excerpt begins mid-expression. The signature is inferred
    # from the call below; the concrete graph_filter used here is an assumption.
    graph_filter = pg.PageRank(alpha=0.9)
    ranks = (pg.to_signal(graph, {v: 1 for v in known_members})
             >> pg.Sweep(graph_filter) >> pg.Normalize("range"))
    if top is not None:
        ranks = ranks * (1 - pg.to_signal(graph, {v: 1 for v in known_members}))  # set known member scores to zero
        return sorted(list(graph), key=lambda node: -ranks[node])[:top]  # return specific number of top predictions

    threshold = pg.optimize(
        max_vals=[1],
        loss=lambda p: pg.Conductance(graph)(pg.Threshold(p[0]).transform(ranks)))[0]
    known_members = set(known_members)
    return [
        v for v in graph if ranks[v] > threshold and v not in known_members
    ]


_, graph, group = next(pg.load_datasets_one_community(["citeseer"]))
print(len(group))
train, test = pg.split(group, 0.1)
found = overlapping_community_detection(graph, train)

# node-based evaluation (we work on the returned list of nodes instead of graph signals)
test = set(test)
TP = len([v for v in found if v in test])
print("Precision", TP / len(found))
print("Recall", TP / len(test))
print("Match size", len(found) / len(test))
Example No. 10
import pygrank as pg
_, graph, community = next(pg.load_datasets_one_community(["EUCore"]))
algorithm = pg.HeatKernel(
    t=5,  # the number of hops away HeatKernel places maximal importance on
    normalization="symmetric",
    renormalize=True)
personalization = {node: 1. for node in community}  # nodes not listed are implicitly zero
algorithms = {
    "HK5": algorithm,
    "HK5+Oversampling": pg.SeedOversampling(algorithm)
}
algorithms = algorithms | pg.create_variations(algorithms,
                                               {"+Sweep": pg.Sweep})
algorithms = pg.create_variations(algorithms, {"": pg.Normalize})

measure = pg.Conductance()
for algorithm_name, algorithm in algorithms.items():
    scores = algorithm(graph, personalization)  # returns a dict-like pg.GraphSignal
    print(algorithm_name, measure(scores))
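Conductance is unsupervised; when part of the community is held out, the same variations can also be compared with a supervised measure. A short sketch reusing the split and AUC patterns from the earlier examples (the split ratio is an arbitrary choice):

# sketch: supervised comparison of the same variations on a held-out part of the community
train, test = pg.split(list(community), training_samples=0.5)
auc = pg.AUC({v: 1 for v in test}, exclude=train)
for algorithm_name, algorithm in algorithms.items():
    print(algorithm_name, auc(algorithm(graph, {v: 1 for v in train})))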