def test_jaccard():
    """Check the Jaccard estimate against the sketch's own error bound.

    Two sketches are built over range(500) and range(100, 500); the true
    Jaccard index of those sets is 400 / 500.  The estimate returned by
    ``jaccard`` must deviate from it, relatively, by no more than what
    ``relative_error`` reports for the first sketch.
    """
    # NOTE(review): an earlier comment here read "k == 400 => relative
    # error of ~0.05", but both sketches are built with k=256 -- confirm
    # which value was intended.
    whole = KMinValues(range(500), k=256)
    tail = KMinValues(range(100, 500), k=256)

    estimated = whole.jaccard(tail)
    exact = 4. / 5.
    tolerance = whole.relative_error()

    assert abs(1 - estimated / exact) <= tolerance
def test_add():
    """Adding items to a k=1 sketch must leave exactly one stored value.

    The same two checks (kmin is not an empty list, and has length 1) are
    made after the first and after the second ``add``.
    """
    sketch = KMinValues(k=1)
    for item in ("TEST1", "TEST2"):
        sketch.add(item)
        assert sketch.kmin != []
        assert len(sketch.kmin) == 1
def test_union():
    """Exercise ``union`` and the ``+`` operator on k=5 sketches.

    ``union`` is called for its effect on the receiver (its return value
    is discarded): after each call the receiver's kmin set must equal the
    argument's.  ``+`` with a sketch over range(40, 50) must produce a
    kmin set that differs from both operands.
    """
    merged = KMinValues(range(10), k=5)
    same_items = KMinValues(range(10), k=5)
    more_items = KMinValues(range(20), k=5)

    for other in (same_items, more_items):
        merged.union(other)
        assert set(merged.kmin) == set(other.kmin)

    disjoint = KMinValues(range(40, 50), k=5)
    combined = disjoint + merged
    for operand in (merged, disjoint):
        assert set(operand.kmin) != set(combined.kmin)
def test_union():
    """Verify ``union`` on the receiver and ``+`` between k=5 sketches.

    The result of ``union`` is not captured, so the assertions rely on it
    updating the sketch it is called on.  The sum of that sketch with one
    built from range(40, 50) is expected to have a kmin set unlike either
    input's.
    """
    base = KMinValues(range(10), k=5)
    twin = KMinValues(range(10), k=5)
    wider = KMinValues(range(20), k=5)

    # Union with an identically built sketch.
    base.union(twin)
    base_values, twin_values = set(base.kmin), set(twin.kmin)
    assert base_values == twin_values

    # Union with a sketch built from a superset of the items.
    base.union(wider)
    base_values, wider_values = set(base.kmin), set(wider.kmin)
    assert base_values == wider_values

    # Operator form against a sketch over a disjoint range.
    far = KMinValues(range(40, 50), k=5)
    total = far + base
    assert set(base.kmin) != set(total.kmin)
    assert set(far.kmin) != set(total.kmin)
def test_constructor():
    """Smoke-test every supported way of calling the constructor.

    Each entry is a (positional args, keyword args) pair; the test passes
    as long as none of the calls raises.
    """
    call_shapes = (
        ((range(50),), {}),          # initial items only
        ((range(50),), {"k": 50}),   # initial items and explicit k
        ((), {"k": 10}),             # explicit k only
        ((), {}),                    # all defaults
    )
    for positional, keyword in call_shapes:
        KMinValues(*positional, **keyword)
def test_update():
    """``update`` with range(10) on an empty k=15 sketch stores 10 values."""
    sketch = KMinValues(k=15)
    incoming = range(10)
    sketch.update(incoming)
    stored = len(sketch.kmin)
    assert stored == 10
}, { "name": "LogLog", "obj": LL(16), }, { "name": "SuperLogLog", "obj": SuperLL(16), }, { "name": "HyperLogLog", "obj": HyperLogLog(b=16), }, { "name": "KMinValues", "obj": KMinValues(k=1 << 16), }, { "name": "ScalingBloom", "obj": ScalingBloomFilter(1048576), }, ] @contextmanager def TimerBlock(name): start = time.time() t = ctypes.c_double() try: yield t finally:
import ujson as json import csv from itertools import islice from collections import (defaultdict, Counter) from countmemaybe import KMinValues import sys import os if __name__ == "__main__": try: data_path = sys.argv[1] except IndexError: data_path = "./" reddit = open(os.path.join(data_path, "reddit.json")) subreddits = defaultdict(lambda: KMinValues(k=1024)) subreddit_counts = csv.reader(open( os.path.join(data_path, "subreddit_counts.txt.sorted.desc")), delimiter=' ') top_50 = set(item[-1] for item in islice(subreddit_counts, 50)) print "Finding commenters" for item in reddit: data = json.loads(item) if data['subreddit'] in top_50: subreddits[data['subreddit']].add(data['author']) subreddits_list = subreddits.items() similarity = Counter() print "Calculating similarity" for i, (k1, v1) in enumerate(subreddits_list[:-1]):