def test_bloom():
    """Check bloom filter creation, lookup and default hash-count choice.

    For each k in (4, 5) a filter is built from 100 random 20-byte
    hashes; its pfalse_positive() must be below .1.  The file is then
    reopened and every inserted hash must be reported present, while
    fewer than 5 of 1000 fresh random hashes may be reported present.

    Finally, filters created on caller-supplied temporary files must
    expose that file as ``rwfile`` and pick k == 5 for expected=100 and
    k == 4 for expected=2**28.
    """
    hashes = [os.urandom(20) for _ in range(100)]

    class Idx:
        pass

    # Minimal stand-in for an index: only the attributes set here are
    # provided to add_idx().
    ix = Idx()
    ix.name = 'dummy.idx'
    ix.shatable = ''.join(hashes)

    for k in (4, 5):
        b = bloom.create('pybuptest.bloom', expected=100, k=k)
        b.add_idx(ix)
        WVPASSLT(b.pfalse_positive(), .1)
        b.close()

        b = bloom.ShaBloom('pybuptest.bloom')
        all_present = True
        for h in hashes:
            all_present &= b.exists(h)
        WVPASS(all_present)
        false_positives = 0
        for h in [os.urandom(20) for _ in range(1000)]:
            if b.exists(h):
                false_positives += 1
        WVPASSLT(false_positives, 5)
        # Release the reopened filter before deleting the file it is
        # backed by, instead of leaving the handle to the garbage
        # collector.
        b.close()
        os.unlink('pybuptest.bloom')

    tf = tempfile.TemporaryFile()
    b = bloom.create('bup.bloom', f=tf, expected=100)
    WVPASSEQ(b.rwfile, tf)
    WVPASSEQ(b.k, 5)

    tf = tempfile.TemporaryFile()
    b = bloom.create('bup.bloom', f=tf, expected=2**28, delaywrite=False)
    WVPASSEQ(b.k, 4)
def test_bloom():
    """Check bloom filter creation, lookup and default hash-count choice.

    For each k in (4, 5) a filter is built from 100 random 20-byte
    hashes; its pfalse_positive() must be below .1.  The file is then
    reopened and every inserted hash must be reported present, while
    fewer than 5 of 1000 fresh random hashes may be reported present.

    Finally, filters created on caller-supplied temporary files must
    expose that file as ``rwfile`` and pick k == 5 for expected=100 and
    k == 4 for expected=2**28.
    """
    hashes = [os.urandom(20) for _ in range(100)]

    class Idx:
        pass

    # Minimal stand-in for an index: only the attributes set here are
    # provided to add_idx().
    ix = Idx()
    ix.name = 'dummy.idx'
    ix.shatable = ''.join(hashes)

    for k in (4, 5):
        b = bloom.create('pybuptest.bloom', expected=100, k=k)
        b.add_idx(ix)
        WVPASSLT(b.pfalse_positive(), .1)
        b.close()

        b = bloom.ShaBloom('pybuptest.bloom')
        all_present = True
        for h in hashes:
            all_present &= b.exists(h)
        WVPASS(all_present)
        false_positives = 0
        for h in [os.urandom(20) for _ in range(1000)]:
            if b.exists(h):
                false_positives += 1
        WVPASSLT(false_positives, 5)
        # Release the reopened filter before deleting the file it is
        # backed by, instead of leaving the handle to the garbage
        # collector.
        b.close()
        os.unlink('pybuptest.bloom')

    tf = tempfile.TemporaryFile()
    b = bloom.create('bup.bloom', f=tf, expected=100)
    WVPASSEQ(b.rwfile, tf)
    WVPASSEQ(b.k, 5)

    tf = tempfile.TemporaryFile()
    b = bloom.create('bup.bloom', f=tf, expected=2**28, delaywrite=False)
    WVPASSEQ(b.k, 4)
def test_bloom():
    """Check bloom filter creation, lookup and default hash-count choice.

    Runs inside no_lingering_errors() and a test_tempdir() scratch
    directory.  For each k in (4, 5) a filter is built from 100 random
    20-byte hashes; its pfalse_positive() must be below .1, every
    inserted hash must be found after reopening the file, and fewer than
    5 of 1000 fresh random hashes may be reported present.

    Filters created on caller-supplied temporary files must expose that
    file as ``rwfile`` and pick k == 5 for expected=100.  The
    expected=2**28 case (k == 4) is skipped with a message when creation
    fails with ENOMEM on a 32-bit platform.
    """
    with no_lingering_errors():
        with test_tempdir('bup-tbloom-') as tmpdir:
            hashes = [os.urandom(20) for _ in range(100)]

            class Idx:
                pass

            # Minimal stand-in for an index: only the attributes set
            # here are provided to add_idx().
            ix = Idx()
            ix.name = 'dummy.idx'
            ix.shatable = ''.join(hashes)

            for k in (4, 5):
                b = bloom.create(tmpdir + '/pybuptest.bloom',
                                 expected=100, k=k)
                b.add_idx(ix)
                WVPASSLT(b.pfalse_positive(), .1)
                b.close()

                b = bloom.ShaBloom(tmpdir + '/pybuptest.bloom')
                all_present = True
                for h in hashes:
                    all_present &= b.exists(h)
                WVPASS(all_present)
                false_positives = 0
                for h in [os.urandom(20) for _ in range(1000)]:
                    if b.exists(h):
                        false_positives += 1
                WVPASSLT(false_positives, 5)
                # Release the reopened filter before deleting the file
                # it is backed by, instead of leaving the handle to the
                # garbage collector.
                b.close()
                os.unlink(tmpdir + '/pybuptest.bloom')

            tf = tempfile.TemporaryFile(dir=tmpdir)
            b = bloom.create('bup.bloom', f=tf, expected=100)
            WVPASSEQ(b.rwfile, tf)
            WVPASSEQ(b.k, 5)

            # Test large (~1GiB) filter.  This may fail on s390 (31-bit
            # architecture), and anywhere else where the address space
            # is sufficiently limited.
            tf = tempfile.TemporaryFile(dir=tmpdir)
            skip_test = False
            try:
                b = bloom.create('bup.bloom', f=tf, expected=2**28,
                                 delaywrite=False)
            except EnvironmentError as ex:
                (ptr_width, linkage) = platform.architecture()
                if ptr_width == '32bit' and ex.errno == errno.ENOMEM:
                    WVMSG(
                        'skipping large bloom filter test (mmap probably failed) '
                        + str(ex))
                    skip_test = True
                else:
                    raise
            if not skip_test:
                WVPASSEQ(b.k, 4)
def test_bloom():
    """Check bloom filter creation, lookup and default hash-count choice.

    Works in a fresh directory created under ``bup_tmp``; that directory
    is removed at the end only if wvfailure_count() has not grown since
    the test started.

    For each k in (4, 5) a filter is built from 100 random 20-byte
    hashes; its pfalse_positive() must be below .1, every inserted hash
    must be found after reopening the file, and fewer than 5 of 1000
    fresh random hashes may be reported present.

    Filters created on caller-supplied temporary files must expose that
    file as ``rwfile`` and pick k == 5 for expected=100.  The
    expected=2**28 case (k == 4) is skipped with a message when creation
    fails with ENOMEM on a 32-bit platform.
    """
    initial_failures = wvfailure_count()
    tmpdir = tempfile.mkdtemp(dir=bup_tmp, prefix='bup-tbloom-')
    hashes = [os.urandom(20) for _ in range(100)]

    class Idx:
        pass

    # Minimal stand-in for an index: only the attributes set here are
    # provided to add_idx().
    ix = Idx()
    ix.name = 'dummy.idx'
    ix.shatable = ''.join(hashes)

    for k in (4, 5):
        b = bloom.create(tmpdir + '/pybuptest.bloom', expected=100, k=k)
        b.add_idx(ix)
        WVPASSLT(b.pfalse_positive(), .1)
        b.close()

        b = bloom.ShaBloom(tmpdir + '/pybuptest.bloom')
        all_present = True
        for h in hashes:
            all_present &= b.exists(h)
        WVPASS(all_present)
        false_positives = 0
        for h in [os.urandom(20) for _ in range(1000)]:
            if b.exists(h):
                false_positives += 1
        WVPASSLT(false_positives, 5)
        # Release the reopened filter before deleting the file it is
        # backed by, instead of leaving the handle to the garbage
        # collector.
        b.close()
        os.unlink(tmpdir + '/pybuptest.bloom')

    tf = tempfile.TemporaryFile()
    b = bloom.create('bup.bloom', f=tf, expected=100)
    WVPASSEQ(b.rwfile, tf)
    WVPASSEQ(b.k, 5)

    # Test large (~1GiB) filter.  This may fail on s390 (31-bit
    # architecture), and anywhere else where the address space is
    # sufficiently limited.
    tf = tempfile.TemporaryFile()
    skip_test = False
    try:
        b = bloom.create('bup.bloom', f=tf, expected=2**28,
                         delaywrite=False)
    except EnvironmentError as ex:
        (ptr_width, linkage) = platform.architecture()
        if ptr_width == '32bit' and ex.errno == errno.ENOMEM:
            WVMSG('skipping large bloom filter test (mmap probably failed) '
                  + str(ex))
            skip_test = True
        else:
            raise
    if not skip_test:
        WVPASSEQ(b.k, 4)

    # Keep the scratch directory around when anything failed.
    if wvfailure_count() == initial_failures:
        subprocess.call(['rm', '-rf', tmpdir])
def test_bloom(tmpdir):
    """Check bloom filter creation, lookup and default hash-count choice.

    Args:
        tmpdir: scratch directory path; it is concatenated with bytes
            literals, so it must be a bytes path.

    For each k in (4, 5) a filter is built from 100 random 20-byte
    hashes; its pfalse_positive() must be below .1, every inserted hash
    must be found after reopening the file, and fewer than 5 of 1000
    fresh random hashes may be reported present.

    Filters created on caller-supplied temporary files must expose that
    file as ``file`` and pick k == 5 for expected=100 and k == 4 for
    expected=2**28.  The large case is skipped (with an info log entry)
    when creation fails with ENOMEM on a 32-bit platform.
    """
    hashes = [os.urandom(20) for _ in range(100)]

    class Idx:
        pass

    # Minimal stand-in for an index: only the attributes set here are
    # provided to add_idx().
    ix = Idx()
    ix.name = b'dummy.idx'
    ix.shatable = b''.join(hashes)

    bloom_path = tmpdir + b'/pybuptest.bloom'
    for k in (4, 5):
        with bloom.create(bloom_path, expected=100, k=k) as b:
            b.add_idx(ix)
            assert b.pfalse_positive() < .1
        with bloom.ShaBloom(bloom_path) as b:
            all_present = True
            for h in hashes:
                # "or False" keeps the &= well-defined if exists()
                # returns None for a miss.
                all_present &= (b.exists(h) or False)
            assert all_present
            false_positives = 0
            for h in [os.urandom(20) for _ in range(1000)]:
                if b.exists(h):
                    false_positives += 1
            assert false_positives < 5
        os.unlink(bloom_path)

    # The temporary files are closed explicitly once the filter using
    # them has been released; closing an already-closed file is a no-op.
    with tempfile.TemporaryFile(dir=tmpdir) as tf:
        with bloom.create(b'bup.bloom', f=tf, expected=100) as b:
            assert b.file == tf
            assert b.k == 5

    # Test large (~1GiB) filter.  This may fail on s390 (31-bit
    # architecture), and anywhere else where the address space is
    # sufficiently limited.
    with tempfile.TemporaryFile(dir=tmpdir) as tf:
        try:
            with bloom.create(b'bup.bloom', f=tf, expected=2**28,
                              delaywrite=False) as b:
                assert b.k == 4
        except EnvironmentError as ex:
            ptr_width = platform.architecture()[0]
            if ptr_width == '32bit' and ex.errno == errno.ENOMEM:
                logging.getLogger().info(
                    'skipping large bloom filter test (mmap probably failed) '
                    + str(ex))
            else:
                raise
def find_live_objects(existing_count, cat_pipe, opt):
    """Build a bloom filter of the ids yielded by walking every ref.

    A temporary ``tmp-gc-*.bloom`` file is created in the repository's
    ``objects/pack`` directory and used to back the filter.  For each
    (name, id) pair from git.list_refs(), every item produced by
    walk_object() has its binary id added to the filter.  Trees already
    seen are remembered so that walk_object() can stop at them.

    Args:
        existing_count: passed as ``expected`` when sizing the filter
            and forwarded to report_live_item().
        cat_pipe: passed through to walk_object().
        opt: only ``opt.verbose`` is read; when true, progress is
            reported per item and a summary line is logged.

    Returns:
        The populated bloom filter.  It is returned open and its backing
        file is not removed here.
    """
    # binascii gives the same result as the Python-2-only 'hex' str
    # codec and also exists on Python 3.
    from binascii import hexlify, unhexlify

    prune_visited_trees = True  # In case we want a command line option later
    pack_dir = git.repo('objects/pack')
    ffd, bloom_filename = tempfile.mkstemp('.bloom', 'tmp-gc-', pack_dir)
    os.close(ffd)
    # FIXME: allow selection of k?
    # FIXME: support ephemeral bloom filters (i.e. *never* written to disk)
    live_objs = bloom.create(bloom_filename, expected=existing_count, k=None)
    stop_at, trees_visited = None, None
    if prune_visited_trees:
        trees_visited = set()
        stop_at = lambda x: unhexlify(x) in trees_visited
    approx_live_count = 0
    for ref_name, ref_id in git.list_refs():
        for item in walk_object(cat_pipe, hexlify(ref_id),
                                stop_at=stop_at,
                                include_data=None):
            # FIXME: batch ids
            if opt.verbose:
                report_live_item(approx_live_count, existing_count,
                                 ref_name, ref_id, item)
            bin_id = unhexlify(item.id)
            if trees_visited is not None and item.type == 'tree':
                trees_visited.add(bin_id)
            if opt.verbose:
                # The count is only maintained for progress reporting.
                if not live_objs.exists(bin_id):
                    live_objs.add(bin_id)
                    approx_live_count += 1
            else:
                live_objs.add(bin_id)
    trees_visited = None  # drop the (possibly large) set
    if opt.verbose:
        log('expecting to retain about %.2f%% unnecessary objects\n'
            % live_objs.pfalse_positive())
    return live_objs
def find_live_objects(repo, existing_count, cat_pipe, verbosity=0):
    """Build a bloom filter of the oids yielded by walking every ref.

    The filter is backed by a ``tmp-gc-*.bloom`` file created in
    repo.packdir(); the file is unlinked right after the filter is
    created.  For each (name, id) pair from repo.refs(), every item
    produced by walk_object() has its oid added to the filter.  Trees
    already seen are remembered so that walk_object() can stop at them.

    Args:
        repo: provides packdir() and refs().
        existing_count: passed as ``expected`` when sizing the filter
            and forwarded to report_live_item().
        cat_pipe: its ``get`` attribute is passed to walk_object().
        verbosity: when truthy, progress is reported per item and a
            summary line is logged.

    Returns:
        The populated bloom filter, still open.
    """
    prune_visited_trees = True  # In case we want a command line option later
    tmp_fd, bloom_path = tempfile.mkstemp(b'.bloom', b'tmp-gc-',
                                          repo.packdir())
    os.close(tmp_fd)
    # FIXME: allow selection of k?
    # FIXME: support ephemeral bloom filters (i.e. *never* written to disk)
    live_objs = bloom.create(bloom_path, expected=existing_count, k=None)
    # live_objs will hold on to the fd until close or exit
    os.unlink(bloom_path)

    if prune_visited_trees:
        trees_visited = set()
        stop_at = lambda x: unhexlify(x) in trees_visited
    else:
        stop_at = None
        trees_visited = None

    approx_live_count = 0
    for ref_name, ref_id in repo.refs():
        walker = walk_object(cat_pipe.get, hexlify(ref_id),
                             stop_at=stop_at, include_data=None)
        for item in walker:
            # FIXME: batch ids
            if verbosity:
                report_live_item(approx_live_count, existing_count,
                                 ref_name, ref_id, item, verbosity)
            if trees_visited is not None and item.type == b'tree':
                trees_visited.add(item.oid)
            if not verbosity:
                live_objs.add(item.oid)
            elif not live_objs.exists(item.oid):
                # Only count ids the filter did not already report.
                live_objs.add(item.oid)
                approx_live_count += 1

    trees_visited = None  # drop the (possibly large) set
    if verbosity:
        false_positive_pct = live_objs.pfalse_positive()
        log('expecting to retain about %.2f%% unnecessary objects\n'
            % false_positive_pct)
    return live_objs
def do_bloom(path, outfilename):
    """Create or extend the bloom filter at outfilename from path's idx files.

    Every ``*.idx`` file in ``path`` is examined.  Files whose basename
    the existing filter already lists are counted as "rest"; all others
    must be added.  The existing filter is reused (reopened read-write)
    unless ``opt.force`` is set, it is invalid, its size disagrees with
    the already-known idx files, or adding the new entries would push
    pfalse_positive() above bloom.MAX_PFALSE_POSITIVE while the filter is
    still below bloom.MAX_BLOOM_BITS.  Otherwise a new filter is built in
    ``bup.tmp.bloom`` inside ``path`` from all idx files and renamed over
    ``outfilename``.

    Args:
        path: directory containing the ``*.idx`` files.
        outfilename: location of the bloom filter file.

    Reads the module-level ``opt`` (``force`` and ``k``) and updates the
    module-level ``_first``.
    """
    global _first
    b = None
    if os.path.exists(outfilename) and not opt.force:
        b = bloom.ShaBloom(outfilename)
        if not b.valid():
            debug1("bloom: Existing invalid bloom found, regenerating.\n")
            b.close()
            b = None

    add = []
    rest = []
    add_count = 0
    rest_count = 0
    for i, name in enumerate(glob.glob('%s/*.idx' % path)):
        progress('bloom: counting: %d\r' % i)
        ix = git.open_idx(name)
        ixbase = os.path.basename(name)
        # Compare with None explicitly: the filter supports len(), so a
        # valid filter holding zero entries may otherwise test as false
        # and be mistaken for "no filter".
        if b is not None and ixbase in b.idxnames:
            rest.append(name)
            rest_count += len(ix)
        else:
            add.append(name)
            add_count += len(ix)

    if not add:
        debug1("bloom: nothing to do.\n")
        if b is not None:
            b.close()
        return

    if b is not None:
        stale = b  # read-only handle, superseded in every branch below
        if len(b) != rest_count:
            debug1("bloom: size %d != idx total %d, regenerating\n"
                   % (len(b), rest_count))
            b = None
        elif (b.bits < bloom.MAX_BLOOM_BITS and
              b.pfalse_positive(add_count) > bloom.MAX_PFALSE_POSITIVE):
            debug1("bloom: regenerating: adding %d entries gives "
                   "%.2f%% false positives.\n"
                   % (add_count, b.pfalse_positive(add_count)))
            b = None
        else:
            b = bloom.ShaBloom(outfilename, readwrite=True,
                               expected=add_count)
        stale.close()

    if b is None:  # Need all idxs to build from scratch
        add += rest
        add_count += rest_count

    msg = 'creating from' if b is None else 'adding'
    if not _first:
        _first = path
    dirprefix = git.repo_rel(path) + ': ' if _first != path else ''
    progress('bloom: %s%s %d file%s (%d object%s).\n'
             % (dirprefix, msg,
                len(add), 's' if len(add) != 1 else '',
                add_count, 's' if add_count != 1 else ''))

    tfname = None
    if b is None:
        tfname = os.path.join(path, 'bup.tmp.bloom')
        b = bloom.create(tfname, expected=add_count, k=opt.k)
    icount = 0
    for name in add:
        ix = git.open_idx(name)
        # Guard the percentage: the idx files to add may hold no objects.
        percent = icount * 100.0 / add_count if add_count else 100.0
        qprogress('bloom: writing %.2f%% (%d/%d objects)\r'
                  % (percent, icount, add_count))
        b.add_idx(ix)
        icount += len(ix)

    # Currently, there's an open file object for tfname inside b.
    # Make sure it's closed before rename.
    b.close()

    if tfname:
        os.rename(tfname, outfilename)
def do_bloom(path, outfilename, k, force):
    """Create or extend the bloom filter at outfilename from path's idx files.

    Every ``*.idx`` file in ``path`` is examined.  Files whose basename
    the existing filter already lists are counted as "rest"; all others
    must be added.  The existing filter is reused (reopened read-write)
    unless ``force`` is set, it is invalid, its size disagrees with the
    already-known idx files, a different ``k`` was requested, or adding
    the new entries would push pfalse_positive() above
    bloom.MAX_PFALSE_POSITIVE while the filter is still below
    bloom.MAX_BLOOM_BITS[b.k].  Otherwise a new filter is built in
    ``bup.tmp.bloom`` inside ``path`` from all idx files and renamed over
    ``outfilename``.

    Args:
        path: directory containing the ``*.idx`` files (bytes path).
        outfilename: location of the bloom filter file.
        k: None, 4 or 5 (asserted); forwarded to bloom.create() and
            compared with an existing filter's ``k`` when not None.
        force: when true, ignore any existing filter.

    Updates the module-level ``_first``.
    """
    global _first
    assert k in (None, 4, 5)
    b = None
    if os.path.exists(outfilename) and not force:
        b = bloom.ShaBloom(outfilename)
        if not b.valid():
            debug1("bloom: Existing invalid bloom found, regenerating.\n")
            b.close()
            b = None

    add = []
    rest = []
    add_count = 0
    rest_count = 0
    for i, name in enumerate(glob.glob(b'%s/*.idx' % path)):
        progress('bloom: counting: %d\r' % i)
        ix = git.open_idx(name)
        ixbase = os.path.basename(name)
        # Compare with None explicitly: the filter supports len(), so a
        # valid filter holding zero entries may otherwise test as false
        # and be mistaken for "no filter".
        if b is not None and ixbase in b.idxnames:
            rest.append(name)
            rest_count += len(ix)
        else:
            add.append(name)
            add_count += len(ix)

    if not add:
        debug1("bloom: nothing to do.\n")
        if b is not None:
            b.close()
        return

    if b is not None:
        stale = b  # read-only handle, superseded in every branch below
        if len(b) != rest_count:
            debug1("bloom: size %d != idx total %d, regenerating\n"
                   % (len(b), rest_count))
            b = None
        elif k is not None and k != b.k:
            debug1("bloom: new k %d != existing k %d, regenerating\n"
                   % (k, b.k))
            b = None
        elif (b.bits < bloom.MAX_BLOOM_BITS[b.k] and
              b.pfalse_positive(add_count) > bloom.MAX_PFALSE_POSITIVE):
            debug1("bloom: regenerating: adding %d entries gives "
                   "%.2f%% false positives.\n"
                   % (add_count, b.pfalse_positive(add_count)))
            b = None
        else:
            b = bloom.ShaBloom(outfilename, readwrite=True,
                               expected=add_count)
        stale.close()

    if b is None:  # Need all idxs to build from scratch
        add += rest
        add_count += rest_count

    msg = 'creating from' if b is None else 'adding'
    if not _first:
        _first = path
    dirprefix = git.repo_rel(path) + b': ' if _first != path else b''
    progress('bloom: %s%s %d file%s (%d object%s).\r'
             % (path_msg(dirprefix), msg,
                len(add), 's' if len(add) != 1 else '',
                add_count, 's' if add_count != 1 else ''))

    tfname = None
    if b is None:
        tfname = os.path.join(path, b'bup.tmp.bloom')
        b = bloom.create(tfname, expected=add_count, k=k)
    icount = 0
    for name in add:
        ix = git.open_idx(name)
        # Guard the percentage: the idx files to add may hold no objects.
        percent = icount * 100.0 / add_count if add_count else 100.0
        qprogress('bloom: writing %.2f%% (%d/%d objects)\r'
                  % (percent, icount, add_count))
        b.add_idx(ix)
        icount += len(ix)

    # Currently, there's an open file object for tfname inside b.
    # Make sure it's closed before rename.
    b.close()

    if tfname:
        os.rename(tfname, outfilename)
def do_bloom(path, outfilename):
    """Create or extend the bloom filter at outfilename from path's idx files.

    Every ``*.idx`` file in ``path`` is examined.  Files whose basename
    the existing filter already lists are counted as "rest"; all others
    must be added.  The existing filter is reused (reopened read-write)
    unless ``opt.force`` is set, it is invalid, its size disagrees with
    the already-known idx files, or adding the new entries would push
    pfalse_positive() above bloom.MAX_PFALSE_POSITIVE while the filter is
    still below bloom.MAX_BLOOM_BITS.  Otherwise a new filter is built in
    ``bup.tmp.bloom`` inside ``path`` from all idx files and renamed over
    ``outfilename``.

    Args:
        path: directory containing the ``*.idx`` files.
        outfilename: location of the bloom filter file.

    Reads the module-level ``opt`` (``force`` and ``k``) and updates the
    module-level ``_first``.
    """
    global _first
    b = None
    if os.path.exists(outfilename) and not opt.force:
        b = bloom.ShaBloom(outfilename)
        if not b.valid():
            debug1("bloom: Existing invalid bloom found, regenerating.\n")
            b.close()
            b = None

    add = []
    rest = []
    add_count = 0
    rest_count = 0
    for i, name in enumerate(glob.glob('%s/*.idx' % path)):
        progress('bloom: counting: %d\r' % i)
        ix = git.open_idx(name)
        ixbase = os.path.basename(name)
        # Compare with None explicitly: the filter supports len(), so a
        # valid filter holding zero entries may otherwise test as false
        # and be mistaken for "no filter".
        if b is not None and ixbase in b.idxnames:
            rest.append(name)
            rest_count += len(ix)
        else:
            add.append(name)
            add_count += len(ix)

    if not add:
        debug1("bloom: nothing to do.\n")
        if b is not None:
            b.close()
        return

    if b is not None:
        stale = b  # read-only handle, superseded in every branch below
        if len(b) != rest_count:
            debug1("bloom: size %d != idx total %d, regenerating\n"
                   % (len(b), rest_count))
            b = None
        elif (b.bits < bloom.MAX_BLOOM_BITS and
              b.pfalse_positive(add_count) > bloom.MAX_PFALSE_POSITIVE):
            debug1("bloom: regenerating: adding %d entries gives "
                   "%.2f%% false positives.\n"
                   % (add_count, b.pfalse_positive(add_count)))
            b = None
        else:
            b = bloom.ShaBloom(outfilename, readwrite=True,
                               expected=add_count)
        stale.close()

    if b is None:  # Need all idxs to build from scratch
        add += rest
        add_count += rest_count

    msg = 'creating from' if b is None else 'adding'
    if not _first:
        _first = path
    dirprefix = git.repo_rel(path) + ': ' if _first != path else ''
    progress('bloom: %s%s %d file%s (%d object%s).\n'
             % (dirprefix, msg,
                len(add), 's' if len(add) != 1 else '',
                add_count, 's' if add_count != 1 else ''))

    tfname = None
    tf = None
    if b is None:
        tfname = os.path.join(path, 'bup.tmp.bloom')
        # The filter is binary data, so open the file in binary mode.
        tf = open(tfname, 'w+b')
        b = bloom.create(tfname, f=tf, expected=add_count, k=opt.k)
    icount = 0
    for name in add:
        ix = git.open_idx(name)
        # Guard the percentage: the idx files to add may hold no objects.
        percent = icount * 100.0 / add_count if add_count else 100.0
        qprogress('bloom: writing %.2f%% (%d/%d objects)\r'
                  % (percent, icount, add_count))
        b.add_idx(ix)
        icount += len(ix)

    # Finish and release the filter, and the temporary file backing it,
    # before the file is renamed into place.  Closing tf again is a
    # no-op if b.close() has already closed it.
    b.close()
    if tf is not None:
        tf.close()

    if tfname:
        os.rename(tfname, outfilename)