示例#1
0
def test_jaccard():
    """Compare the estimated Jaccard index with the exact one.

    ``range(100, 500)`` is contained in ``range(500)``, so the exact index
    is 400 / 500.  Both sketches are built with ``k=256``; the relative
    deviation of the estimate must not exceed what ``relative_error()``
    reports for the first sketch.
    """
    sketch_size = 256
    full = KMinValues(range(500), k=sketch_size)
    subset = KMinValues(range(100, 500), k=sketch_size)

    estimated = full.jaccard(subset)
    exact = 4. / 5.
    tolerance = full.relative_error()

    deviation = abs(1 - estimated / exact)
    assert deviation <= tolerance
def test_jaccard():
    """Compare the estimated Jaccard index with the exact one.

    ``range(100, 500)`` is contained in ``range(500)``, so the exact index
    is 400 / 500.  Both sketches are built with ``k=256``; the relative
    deviation of the estimate must not exceed what ``relative_error()``
    reports for the first sketch.
    """
    sketch_size = 256
    full = KMinValues(range(500), k=sketch_size)
    subset = KMinValues(range(100, 500), k=sketch_size)

    estimated = full.jaccard(subset)
    exact = 4. / 5.
    tolerance = full.relative_error()

    deviation = abs(1 - estimated / exact)
    assert deviation <= tolerance
示例#3
0
def test_add():
    """Adding items one at a time to a ``k=1`` sketch keeps ``kmin`` at size one."""
    sketch = KMinValues(k=1)
    for item in ("TEST1", "TEST2"):
        sketch.add(item)
        # After each insertion kmin must be non-empty and hold exactly
        # one entry.
        assert sketch.kmin != []
        assert len(sketch.kmin) == 1
示例#4
0
def test_union():
    """Exercise ``union`` and the ``+`` operator on ``k=5`` sketches.

    After each ``union`` call the receiving sketch is expected to hold the
    same set of ``kmin`` values as the sketch it was united with.  Adding a
    sketch over ``range(40, 50)`` with ``+`` must produce a result whose
    ``kmin`` set differs from both operands.
    """
    base = KMinValues(range(10), k=5)
    same_items = KMinValues(range(10), k=5)
    superset = KMinValues(range(20), k=5)

    for other in (same_items, superset):
        base.union(other)
        assert set(base.kmin) == set(other.kmin)

    distant = KMinValues(range(40,50), k=5)
    combined = distant + base
    for operand in (base, distant):
        assert set(operand.kmin) != set(combined.kmin)
def test_union():
    """Exercise ``union`` and the ``+`` operator on ``k=5`` sketches.

    After each ``union`` call the receiving sketch is expected to hold the
    same set of ``kmin`` values as the sketch it was united with.  Adding a
    sketch over ``range(40, 50)`` with ``+`` must produce a result whose
    ``kmin`` set differs from both operands.
    """
    base = KMinValues(range(10), k=5)
    same_items = KMinValues(range(10), k=5)
    superset = KMinValues(range(20), k=5)

    for other in (same_items, superset):
        base.union(other)
        assert set(base.kmin) == set(other.kmin)

    distant = KMinValues(range(40, 50), k=5)
    combined = distant + base
    for operand in (base, distant):
        assert set(operand.kmin) != set(combined.kmin)
def test_add():
    """Adding items one at a time to a ``k=1`` sketch keeps ``kmin`` at size one."""
    sketch = KMinValues(k=1)
    for item in ("TEST1", "TEST2"):
        sketch.add(item)
        # After each insertion kmin must be non-empty and hold exactly
        # one entry.
        assert sketch.kmin != []
        assert len(sketch.kmin) == 1
def test_constructor():
    """Smoke test: each constructor call form below must complete without raising.

    Covers: items only, items plus ``k``, ``k`` only, and no arguments.
    """
    call_forms = (
        ((range(50),), {}),
        ((range(50),), {"k": 50}),
        ((), {"k": 10}),
        ((), {}),
    )
    for positional, keyword in call_forms:
        KMinValues(*positional, **keyword)
def test_update():
    """Bulk ``update`` with 10 items on a ``k=15`` sketch leaves 10 ``kmin`` entries."""
    items = range(10)
    sketch = KMinValues(k=15)
    sketch.update(items)

    stored = len(sketch.kmin)
    assert stored == 10
    },
    {
        "name": "LogLog",
        "obj": LL(16),
    },
    {
        "name": "SuperLogLog",
        "obj": SuperLL(16),
    },
    {
        "name": "HyperLogLog",
        "obj": HyperLogLog(b=16),
    },
    {
        "name": "KMinValues",
        "obj": KMinValues(k=1 << 16),
    },
    {
        "name": "ScalingBloom",
        "obj": ScalingBloomFilter(1048576),
    },
]


@contextmanager
def TimerBlock(name):
    start = time.time()
    t = ctypes.c_double()
    try:
        yield t
    finally:
import ujson as json
import csv
from itertools import islice
from collections import (defaultdict, Counter)
from countmemaybe import KMinValues
import sys
import os

if __name__ == "__main__":
    try:
        data_path = sys.argv[1]
    except IndexError:
        data_path = "./"

    reddit = open(os.path.join(data_path, "reddit.json"))
    subreddits = defaultdict(lambda: KMinValues(k=1024))

    subreddit_counts = csv.reader(open(
        os.path.join(data_path, "subreddit_counts.txt.sorted.desc")),
                                  delimiter=' ')
    top_50 = set(item[-1] for item in islice(subreddit_counts, 50))
    print "Finding commenters"
    for item in reddit:
        data = json.loads(item)
        if data['subreddit'] in top_50:
            subreddits[data['subreddit']].add(data['author'])

    subreddits_list = subreddits.items()
    similarity = Counter()
    print "Calculating similarity"
    for i, (k1, v1) in enumerate(subreddits_list[:-1]):
示例#11
0
def test_update():
    """Bulk ``update`` with 10 items on a ``k=15`` sketch leaves 10 ``kmin`` entries."""
    items = range(10)
    sketch = KMinValues(k=15)
    sketch.update(items)

    stored = len(sketch.kmin)
    assert stored == 10