示例#1
0
    def test_add(self):
        """Add the strings "0".."9" and check the non-zero entries of ``M``."""
        sketch = HyperLogLog(0.05)
        for key in map(str, range(10)):
            sketch.add(key)

        # Only entries that were touched, as (index, value) pairs.
        nonzero = [
            (idx, reg) for idx, reg in enumerate(sketch.M) if reg > 0
        ]
        expected = [
            (1, 1), (41, 1), (44, 1), (76, 3), (103, 4),
            (182, 1), (442, 2), (464, 5), (497, 1), (506, 1),
        ]
        self.assertEqual(nonzero, expected)
示例#2
0
 def test_pickle(self):
     """Round-trip a populated HyperLogLog through pickle and compare state."""
     original = HyperLogLog(0.05)
     for key in map(str, range(100)):
         original.add(key)

     restored = pickle.loads(pickle.dumps(original))

     # Compared in this order; the first mismatch fails the test.
     for attr in ("M", "alpha", "p", "m"):
         self.assertEqual(getattr(original, attr), getattr(restored, attr))
示例#3
0
 def test_pickle(self):
     """Verify that pickling preserves ``M``, ``alpha``, ``p`` and ``m``."""
     before = HyperLogLog(0.05)
     for n in range(100):
         before.add(str(n))

     # Serialize, then rebuild a second object from the serialized bytes.
     payload = pickle.dumps(before)
     after = pickle.loads(payload)

     self.assertEqual(before.M, after.M)
     self.assertEqual(before.alpha, after.alpha)
     self.assertEqual(before.p, after.p)
     self.assertEqual(before.m, after.m)
示例#4
0
    def test_add(self):
        """Check which entries of ``M`` are set after adding ten string keys."""
        hll = HyperLogLog(0.05)
        for n in range(10):
            hll.add(str(n))

        expected = [
            (1, 1), (41, 1), (44, 1), (76, 3), (103, 4),
            (182, 1), (442, 2), (464, 5), (497, 1), (506, 1),
        ]

        # Collect (index, value) for every entry greater than zero.
        observed = []
        for position, value in enumerate(hll.M):
            if value > 0:
                observed.append((position, value))

        self.assertEqual(observed, expected)
示例#5
0
    def test_calc_cardinality(self):
        """Check that averaged cardinality estimates stay close to the truth.

        For each target cardinality ``card``, ``n`` fresh HyperLogLog
        instances are each fed ``card`` random 20-byte values and their
        ``card()`` results are averaged. The deviation of that mean from
        ``card`` is scaled by ``rel_err * card / sqrt(n)`` and must lie
        strictly between -1.96 and 1.96.
        """
        clist = [1, 5, 10, 30, 60, 200, 1000, 10000, 60000]
        n = 30
        rel_err = 0.05

        for card in clist:
            s = 0.0
            # ``range`` instead of ``xrange``: ``xrange`` does not exist on
            # Python 3, while ``range`` iterates correctly on both 2 and 3.
            for _ in range(n):
                a = HyperLogLog(rel_err)

                for _ in range(card):
                    # Input is random, so this test is not deterministic.
                    a.add(os.urandom(20))

                s += a.card()

            z = (float(s) / n - card) / (rel_err * card / math.sqrt(n))
            self.assertLess(-1.96, z)
            self.assertGreater(1.96, z)
示例#6
0
    def test_calc_cardinality(self):
        """Average repeated estimates and require the scaled error within +/-3.

        Each cardinality is estimated ``trials`` times on random 20-byte
        inputs; the mean's deviation from the true count, divided by
        ``rel_err * card / sqrt(trials)``, must lie strictly inside (-3, 3).
        """
        cardinalities = [1, 5, 10, 30, 60, 200, 1000, 10000, 60000]
        trials = 30
        rel_err = 0.05

        for card in cardinalities:
            total = 0.0
            for _ in range(trials):
                hll = HyperLogLog(rel_err)
                for _ in range(card):
                    hll.add(os.urandom(20))
                total += hll.card()

            mean_estimate = float(total) / trials
            scale = rel_err * card / math.sqrt(trials)
            z = (mean_estimate - card) / scale

            self.assertLess(-3, z)
            self.assertGreater(3, z)
示例#7
0
 def test_save(self):
     """Copy state via ``save()``/``load()`` and compare both objects."""
     source = HyperLogLog(0.05)
     for key in map(str, range(100)):
         source.add(key)

     snapshot = source.save()
     target = HyperLogLog(0.05)
     target.load(snapshot)

     # Compared in this order; the first mismatch fails the test.
     for attr in ("M", "alpha", "p", "m"):
         self.assertEqual(getattr(source, attr), getattr(target, attr))
示例#8
0
    def test_update(self):
        """Check that ``update`` merges another HyperLogLog into the receiver.

        ``a`` receives the keys "0" and "1", ``b`` receives "2" and "3", and
        ``c`` receives all four. After ``a.update(b)``, ``a`` must compare
        equal to ``c`` while ``b`` differs from both.
        """
        a = HyperLogLog(0.05)
        b = HyperLogLog(0.05)
        c = HyperLogLog(0.05)

        # ``range`` instead of ``xrange``: ``xrange`` does not exist on
        # Python 3, while ``range`` iterates correctly on both 2 and 3.
        for i in range(2):
            a.add(str(i))
            c.add(str(i))

        for i in range(2, 4):
            b.add(str(i))
            c.add(str(i))

        a.update(b)

        self.assertNotEqual(a, b)
        self.assertNotEqual(b, c)
        self.assertEqual(a, c)
示例#9
0
    def test_update(self):
        """Merging two partial HyperLogLogs must equal one fed every key."""
        left = HyperLogLog(0.05)
        right = HyperLogLog(0.05)
        combined = HyperLogLog(0.05)

        # ``left`` gets keys "0","1"; ``right`` gets "2","3"; ``combined``
        # sees every key, added right after the partial one.
        for partial, numbers in ((left, range(2)), (right, range(2, 4))):
            for key in map(str, numbers):
                partial.add(key)
                combined.add(key)

        left.update(right)

        self.assertNotEqual(left, right)
        self.assertNotEqual(right, combined)
        self.assertEqual(left, combined)
示例#10
0
 def test_init(self):
     """Check the attributes of a HyperLogLog built with error rate 0.05."""
     sketch = HyperLogLog(0.05)

     expected_attrs = (
         ("p", 9),
         ("alpha", 0.7197831133217303),
         ("m", 512),
     )
     for attr, value in expected_attrs:
         self.assertEqual(getattr(sketch, attr), value)

     # ``M`` must have 512 entries, the same value expected for ``m``.
     self.assertEqual(len(sketch.M), 512)
示例#11
0
    def test_update_err(self):
        """``update`` must raise ValueError for differing error rates."""
        coarse = HyperLogLog(0.05)
        fine = HyperLogLog(0.01)

        with self.assertRaises(ValueError):
            coarse.update(fine)
示例#12
0
import khmer
import sys
from screed.fasta import fasta_iter

from hyperloglog.hll import HyperLogLog


# Command-line arguments: path of the input file (opened and handed to
# ``fasta_iter`` below), followed by the k-mer length.
filename = sys.argv[1]
K = int(sys.argv[2])  # size of kmer

# Error rate passed to both cardinality counters created below.
ERROR_RATE = .01
# Translation table swapping A<->T and C<->G; applied to a reversed k-mer
# below to build ``rc``.
# NOTE(review): ``string`` is not imported in the visible import lines, and
# ``string.maketrans`` exists only on Python 2 — confirm.
TT = string.maketrans('ACGT', 'TGCA')

# Two counters built with the same error rate: one created through khmer,
# one from the ``hyperloglog`` package.
hllcpp = khmer.new_hll_counter(ERROR_RATE)
hlllib = HyperLogLog(ERROR_RATE)
# NOTE(review): ``Counter`` is not imported in the visible import lines;
# presumably ``collections.Counter`` — confirm.
counter = Counter()
counter_norc = Counter()

for n, record in enumerate(fasta_iter(open(filename))):
    sequence = record['sequence']
    seq_len = len(sequence)
    for n in range(0, seq_len + 1 - K):
        kmer = sequence[n:n + K]
        rc = kmer[::-1].translate(TT)

        hllcpp.add(kmer)
        hlllib.add(kmer)
        counter_norc.update([kmer])

        if rc in counter: