def test_sums_detailed(self, data):
    it = FixedIterator(3)
    alg = Sums(iterator=it, priors=PriorBelief.FIXED)
    initial, first, second, third = alg.run_iter(data)
    assert initial.belief == {
        "x": {"one": 0.5},
        "y": {"nine": 0.5, "eight": 0.5},
        "z": {"seven": 0.5}
    }
    assert initial.trust == {"s1": 0, "s2": 0, "s3": 0}
    assert first.trust == {"s1": 1, "s2": 2 / 3, "s3": 1 / 3}
    assert first.belief == {
        "x": {"one": 1},
        "y": {"nine": 3 / 5, "eight": 2 / 5},
        "z": {"seven": 4 / 5}
    }
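# The expected first-iteration numbers above follow from one round of the
# Sums (Hubs and Authorities) update: each source's trust is the sum of its
# claims' beliefs, each claim's belief is the sum of its sources' trusts,
# and both vectors are normalised by their maximum entry. A minimal
# standalone sketch; the claim sets below are an assumption reconstructed
# to reproduce the expected values, not taken from the actual `data`
# fixture:
def sums_first_iteration_sketch():
    claims = {
        "s1": {("x", "one"), ("y", "nine"), ("z", "seven")},
        "s2": {("x", "one"), ("y", "eight")},
        "s3": {("z", "seven")},
    }
    # Fixed priors: every claim starts with belief 0.5
    belief = {c: 0.5 for cs in claims.values() for c in cs}
    # Trust update, normalised by the maximum
    trust = {s: sum(belief[c] for c in cs) for s, cs in claims.items()}
    max_trust = max(trust.values())
    trust = {s: t / max_trust for s, t in trust.items()}
    # Belief update, normalised by the maximum
    belief = {
        c: sum(t for s, t in trust.items() if c in claims[s])
        for c in belief
    }
    max_belief = max(belief.values())
    belief = {c: b / max_belief for c, b in belief.items()}
    # trust: s1=1, s2=2/3, s3=1/3; belief matches the assertions above
    return trust, belief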
def test_empty_dataset(self):
    data = Dataset([])
    non_it = MajorityVoting()
    it = Sums()
    err_msg = "Cannot run algorithm on empty dataset"
    for alg in [non_it, it]:
        with pytest.raises(EmptyDatasetError) as excinfo:
            alg.run(data)
        assert str(excinfo.value) == err_msg
    # Test with run_iter also
    with pytest.raises(EmptyDatasetError) as excinfo2:
        _l = list(it.run_iter(data))
    assert str(excinfo2.value) == err_msg
def test_progress_bar(self, dataset):
    w = 200
    rend = GraphRenderer(width=w, backend=JsonBackend())
    anim = JsonAnimator(renderer=rend)
    buf = StringIO()
    it = FixedIterator(20)
    alg = Sums(iterator=it)
    anim.animate(buf, alg, dataset, show_progress=True)
    buf.seek(0)
    obj = json.load(buf)
    # Get the frame for the 5th iteration, which is 1/4 of the way through
    frame = obj["frames"][5]
    rects = [
        e for e in frame["entities"]
        if e["type"] == "rectangle" and e["width"] != w
    ]
    assert len(rects) == 1
    assert rects[0]["x"] == 0
    assert rects[0]["width"] == w / 4
    # Test without progress
    buf2 = StringIO()
    anim.animate(buf2, alg, dataset, show_progress=False)
    buf2.seek(0)
    obj2 = json.load(buf2)
    frame2 = obj2["frames"][5]
    rects2 = [
        e for e in frame2["entities"]
        if e["type"] == "rectangle" and e["width"] != w
    ]
    assert not rects2
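# The progress bar rectangle the test looks for is assumed to span a
# fraction of the renderer width equal to (iterations completed) / (total
# iterations); e.g. after 5 of 20 iterations with width 200:
expected_progress_width = 200 * (5 / 20)  # == 200 / 4 == 50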
def test_num_iterations(self):
    data = Dataset([("source 1", "x", 7), ("source 2", "x", 8)])
    voting_res = MajorityVoting().run(data)
    assert voting_res.iterations is None
    sums_res = Sums(iterator=FixedIterator(13)).run(data)
    assert sums_res.iterations == 13
def test_time_taken(self):
    """
    Test the run time recorded in Result objects for iterative and
    non-iterative algorithms
    """
    data = Dataset([("source 1", "x", 7), ("source 2", "x", 8)])
    # time.time is assumed to be patched by a fixture elsewhere in the
    # suite, so that each run appears to take exactly 5 seconds
    res = MajorityVoting().run(data)
    assert res.time_taken == 5
    res = Sums().run(data)
    assert res.time_taken == 5
def test_basic(self, data):
    """
    Perform Sums on a small graph. The expected results were obtained by
    finding eigenvectors of suitable matrices (using numpy "by hand"), as
    per the Kleinberg paper for Hubs and Authorities
    """
    sums = Sums(iterator=ConvergenceIterator(DistanceMeasures.L1, 0.00001))
    results = sums.run(data)
    assert np.isclose(results.trust["s1"], 1)
    assert np.isclose(results.trust["s2"], 0.53208889)
    assert np.isclose(results.trust["s3"], 0.34729636)
    assert set(results.belief["x"].keys()) == {"one"}
    assert np.isclose(results.belief["x"]["one"], 1)
    assert set(results.belief["y"].keys()) == {"eight", "nine"}
    assert np.isclose(results.belief["y"]["nine"], 0.65270364)
    assert np.isclose(results.belief["y"]["eight"], 0.34729636)
    assert set(results.belief["z"].keys()) == {"seven"}
    assert np.isclose(results.belief["z"]["seven"], 0.87938524)
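# At convergence, Sums trust and belief are (up to normalisation by the
# maximum entry) the principal eigenvectors of B @ B.T and B.T @ B, where
# B is the source-claim incidence matrix. A sketch of how the constants
# above can be reproduced; the matrix B is an assumption reconstructed
# from the assertions, not read from the `data` fixture:
import numpy as np

def sums_fixed_point_sketch():
    # Rows: s1, s2, s3; columns: x=one, y=nine, y=eight, z=seven
    B = np.array([
        [1, 1, 0, 1],
        [1, 0, 1, 0],
        [0, 0, 0, 1],
    ])

    def principal_eigenvector(M):
        # Eigenvector for the largest eigenvalue of a symmetric matrix,
        # scaled so the maximum entry is 1
        vals, vecs = np.linalg.eigh(M)
        v = np.abs(vecs[:, np.argmax(vals)])
        return v / v.max()

    trust = principal_eigenvector(B @ B.T)
    # trust ~ [1, 0.53208889, 0.34729636]
    belief = principal_eigenvector(B.T @ B)
    # belief ~ [1, 0.65270364, 0.34729636, 0.87938524]
    return trust, belief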
def test_belief_stats(self, csv_dataset, csv_fileobj, capsys):
    self.run("run", "-a", "sums", "-f", csv_dataset, "-o", "belief_stats")
    results = yaml.safe_load(capsys.readouterr().out)["sums"]
    assert set(results.keys()) == {"belief_stats"}
    exp_belief_stats = (
        Sums().run(MatrixDataset.from_csv(csv_fileobj)).get_belief_stats()
    )
    assert results["belief_stats"] == {
        var: {"mean": mean, "stddev": stddev}
        for var, (mean, stddev) in exp_belief_stats.items()
    }
def test_get_output_obj(self, csv_fileobj):
    dataset = MatrixDataset.from_csv(csv_fileobj)
    alg = Sums(iterator=FixedIterator(5))
    # Default should be all fields if none are given, but not accuracy
    # unless supervised data is given
    results = alg.run(dataset)
    out1 = BaseClient().get_output_obj(results)
    exp_keys = {f.value for f in OutputFields if f != OutputFields.ACCURACY}
    assert set(out1.keys()) == exp_keys

    sup_data = SupervisedData.from_csv(csv_fileobj)
    sup_results = alg.run(sup_data.data)
    out2 = BaseClient().get_output_obj(sup_results, sup_data=sup_data)
    assert set(out2.keys()) == {f.value for f in OutputFields}
    assert out2["trust"] == sup_results.trust
    assert out2["belief"] == sup_results.belief

    out3 = BaseClient().get_output_obj(
        results, output_fields=[OutputFields.TRUST]
    )
    assert set(out3.keys()) == {"trust"}
def test_gif_animation(self, dataset):
    w, h = 123, 95
    renderer = GraphRenderer(width=w, node_radius=10, spacing=5)
    animator = GifAnimator(renderer=renderer)
    alg = Sums()
    buf = BytesIO()
    animator.animate(buf, alg, dataset)
    buf.seek(0)
    assert is_valid_gif(buf)
    # Check dimensions are as expected: imageio reports the shape as
    # (height, width, channels)
    buf.seek(0)
    img_data = imageio.imread(buf)
    got_h, got_w, _ = img_data.shape
    assert (got_w, got_h) == (w, h)
def test_custom_output(self, csv_fileobj, csv_dataset, capsys):
    self.run("run", "-a", "sums", "-f", csv_dataset, "-o", "time")
    results = yaml.safe_load(capsys.readouterr().out)["sums"]
    assert set(results.keys()) == {"time"}

    self.run("run", "-a", "sums", "-f", csv_dataset,
             "-o", "time", "iterations")
    results = yaml.safe_load(capsys.readouterr().out)["sums"]
    assert set(results.keys()) == {"time", "iterations"}

    self.run("run", "-a", "sums", "-f", csv_dataset,
             "-o", "trust", "trust_stats")
    results = yaml.safe_load(capsys.readouterr().out)["sums"]
    assert set(results.keys()) == {"trust", "trust_stats"}
    exp_mean, exp_stddev = (
        Sums().run(MatrixDataset.from_csv(csv_fileobj)).get_trust_stats()
    )
    assert results["trust_stats"] == {
        "mean": exp_mean,
        "stddev": exp_stddev
    }
def test_json_animation(self, dataset):
    w, h = 123, 95
    renderer = GraphRenderer(width=w, node_radius=10, spacing=5,
                             backend=JsonBackend())
    animator = JsonAnimator(renderer=renderer, frame_duration=1 / 9)
    alg = Sums(iterator=FixedIterator(4))
    buf = StringIO()
    animator.animate(buf, alg, dataset)
    buf.seek(0)
    obj = json.load(buf)
    assert "fps" in obj
    assert obj["fps"] == 9
    assert "frames" in obj
    assert isinstance(obj["frames"], list)
    # The initial state plus one frame per iteration
    assert len(obj["frames"]) == 5
    assert isinstance(obj["frames"][0], dict)
    assert "width" in obj["frames"][0]
    assert "height" in obj["frames"][0]
    assert "entities" in obj["frames"][0]
    assert obj["frames"][0]["width"] == w
    assert obj["frames"][0]["height"] == h
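# For reference, the JSON document shape the assertions above imply. The
# field names come from the assertions; the entity contents shown are
# illustrative assumptions only:
EXAMPLE_ANIMATION_OUTPUT = {
    "fps": 9,
    "frames": [
        {
            "width": 123,
            "height": 95,
            "entities": [
                # e.g. {"type": "rectangle", "x": 0, "width": 50, ...}
            ]
        },
        # ...one frame per iteration, plus the initial state
    ]
}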
""" from collections import OrderedDict import itertools import json import sys import numpy as np import matplotlib.pyplot as plt from truthdiscovery.input import SyntheticData from truthdiscovery.algorithm import (AverageLog, Investment, MajorityVoting, PooledInvestment, Sums, TruthFinder) ALGORITHMS = OrderedDict({ "Voting": MajorityVoting(), "Sums": Sums(), "Average.Log": AverageLog(), "Investment": Investment(), "Pooled Investment": PooledInvestment(), "TruthFinder": TruthFinder() }) class Experiment: # labels for values of independent variable labels = None # dict mapping algorithm labels to objects algorithms = None # number of trials to perform for each value reps = 10 # parameters to pass to synthetic data generation. Value for independent
def test_base(self, dataset):
    class MyAnimator(BaseAnimator):
        supported_backends = (PngBackend,)

    with pytest.raises(NotImplementedError):
        MyAnimator().animate(BytesIO(), Sums(), dataset)
def test_results_based_valid_png(self, dataset, tmpdir):
    cs = ResultsGradientColourScheme(Sums().run(dataset))
    out = tmpdir.join("mygraph.png")
    GraphRenderer(backend=PngBackend(), colours=cs).render(dataset, out)
    with open(str(out), "rb") as f:
        assert is_valid_png(f)
def test_sums(self, data):
    sums = Sums(iterator=FixedIterator(20))
    self.check_results(sums, data, "sums_results.json")
def main():
    # Show usage
    if len(sys.argv) > 1 and sys.argv[1] in ("-h", "--help"):
        usage()
        return

    dataset = None
    sup = None

    # Unpickle dataset from a file if only one argument given
    if len(sys.argv) == 2:
        print("unpickling data...")
        start = time.time()
        with open(sys.argv[1], "rb") as pickle_file:
            sup = pickle.load(pickle_file)
        end = time.time()
        print(" unpickled in {:.3f} seconds".format(end - start))
        dataset = sup.data
    elif len(sys.argv) == 3:
        data_path, truth_path = sys.argv[1:]
        print("loading data...")
        start = time.time()
        dataset = StockDataset(data_path)
        end = time.time()
        print(" loaded in {:.3f} seconds".format(end - start))

        print("loading true values...")
        start = time.time()
        sup = SupervisedStockData(dataset, truth_path)
        end = time.time()
        print(" loaded in {:.3f} seconds".format(end - start))

        pickle_path = "/tmp/stock_data.pickle"
        with open(pickle_path, "wb") as pickle_file:
            pickle.dump(sup, pickle_file)
        print("pickled to {}".format(pickle_path))
    else:
        usage(sys.stderr)
        sys.exit(1)

    print("")
    print("dataset has {} sources, {} claims, {} variables".format(
        dataset.num_sources, dataset.num_claims, dataset.num_variables
    ))

    start = time.time()
    print("calculating connected components...")
    components = dataset.num_connected_components()
    end = time.time()
    print(" calculated in {:.3f} seconds: {} components".format(
        end - start, components
    ))

    algorithms = [
        MajorityVoting(), Sums(), AverageLog(), Investment(),
        PooledInvestment(), TruthFinder()
    ]
    for alg in algorithms:
        print("running {}...".format(alg.__class__.__name__))
        res = alg.run(sup.data)
        acc = sup.get_accuracy(res)
        print(" {:.3f} seconds, {:.3f} accuracy".format(res.time_taken, acc))
plain = False
if len(sys.argv) == 3 and sys.argv[1] == "--plain":
    outpath = sys.argv[2]
    plain = True
elif len(sys.argv) == 2:
    outpath = sys.argv[1]
else:
    print("usage: {} [--plain] DEST".format(sys.argv[0]), file=sys.stderr)
    sys.exit(1)

tuples = [
    ("source 1", "x", 4),
    ("source 1", "y", 7),
    ("source 2", "y", 7),
    ("source 2", "z", 5),
    ("source 3", "x", 3),
    ("source 3", "z", 5),
    ("source 4", "x", 3),
    ("source 4", "y", 6),
    ("source 4", "z", 8),
    ("my really long source name", "mylongvar", "extremelylongvalue"),
]
mydata = Dataset(tuples)
results = Sums().run(mydata)
colour_scheme = (PlainColourScheme() if plain
                 else ResultsGradientColourScheme(results))
renderer = GraphRenderer(width=1000, colours=colour_scheme)
with open(outpath, "wb") as imgfile:
    renderer.render(mydata, imgfile)
from truthdiscovery.input import MatrixDataset, SyntheticData
from truthdiscovery.algorithm import (AverageLog, Investment, MajorityVoting,
                                      PooledInvestment, Sums, TruthFinder)

# Sizes for sources/variables to use in the experiments
DATA_SIZES = list(range(100, 2001, 200))
# The fixed size for the parameter that is not being varied
FIXED_SIZE = 500
# Parameters for synthetic data generation
CLAIM_PROBABILITY = 0.1
DOMAIN_SIZE = 4

# Pass a list of pairs so the ordering is preserved on all Python versions;
# labels are lower-cased for consistency
ALGORITHMS = OrderedDict([
    ("voting", MajorityVoting()),
    ("sums", Sums()),
    ("average.log", AverageLog()),
    ("investment", Investment()),
    ("pooled investment", PooledInvestment()),
    ("truthfinder", TruthFinder())
])


def generate_timings():
    print("generating large trust vector...", file=sys.stderr)
    max_size = max(DATA_SIZES)
    trust = np.random.uniform(size=(max_size, ))

    print("generating large dataset...", file=sys.stderr)
    large_synth = SyntheticData(trust, num_variables=max_size,