Exemplo n.º 1
0
def test_bloom():
    """Exercise bloom filter creation, reopening and parameter selection.

    For each k in (4, 5) a filter is created on disk from 100 random
    20-byte hashes, its reported false-positive rate is checked, and the
    file is reopened to verify that every inserted hash is found while
    fewer than 5 of 1000 fresh random hashes are reported as present.
    Afterwards the k picked by bloom.create() is checked for a small and a
    large ``expected`` value on caller-supplied temporary files.

    Every filter and temporary file is closed explicitly so that no handle
    is still open when the backing file is unlinked or the test returns.
    """
    hashes = [os.urandom(20) for i in range(100)]

    class Idx:
        # Bare attribute holder; only the attributes set below are provided
        # to add_idx().
        pass

    ix = Idx()
    ix.name = 'dummy.idx'
    # NOTE(review): joining with '' requires os.urandom() to return str,
    # i.e. this snippet targets Python 2.
    ix.shatable = ''.join(hashes)
    for k in (4, 5):
        b = bloom.create('pybuptest.bloom', expected=100, k=k)
        try:
            b.add_idx(ix)
            WVPASSLT(b.pfalse_positive(), .1)
        finally:
            b.close()

        b = bloom.ShaBloom('pybuptest.bloom')
        try:
            all_present = True
            for h in hashes:
                all_present &= b.exists(h)
            WVPASS(all_present)
            false_positives = 0
            for h in [os.urandom(20) for i in range(1000)]:
                if b.exists(h):
                    false_positives += 1
            WVPASSLT(false_positives, 5)
        finally:
            # Release the reopened filter before its file is removed.
            b.close()
        os.unlink('pybuptest.bloom')

    tf = tempfile.TemporaryFile()
    try:
        b = bloom.create('bup.bloom', f=tf, expected=100)
        try:
            WVPASSEQ(b.rwfile, tf)
            WVPASSEQ(b.k, 5)
        finally:
            b.close()
    finally:
        tf.close()

    tf = tempfile.TemporaryFile()
    try:
        b = bloom.create('bup.bloom', f=tf, expected=2**28, delaywrite=False)
        try:
            WVPASSEQ(b.k, 4)
        finally:
            b.close()
    finally:
        tf.close()
Exemplo n.º 2
0
def test_bloom():
    """Check bloom filter round-tripping and the automatic choice of k.

    For k in (4, 5): create a filter file from 100 random 20-byte hashes,
    check pfalse_positive(), reopen the file, confirm all inserted hashes
    are found and that fewer than 5 out of 1000 new random hashes are
    reported present.  Then create filters on caller-supplied temporary
    files and check the k chosen for expected=100 and expected=2**28.

    Filters and temporary files are closed explicitly; the reopened filter
    is closed before its file is unlinked.
    """
    hashes = [os.urandom(20) for i in range(100)]

    class Idx:
        # Placeholder object carrying just the attributes assigned below.
        pass

    ix = Idx()
    ix.name = 'dummy.idx'
    # NOTE(review): ''.join() of os.urandom() output only works where that
    # output is str (Python 2).
    ix.shatable = ''.join(hashes)
    for k in (4, 5):
        b = bloom.create('pybuptest.bloom', expected=100, k=k)
        try:
            b.add_idx(ix)
            WVPASSLT(b.pfalse_positive(), .1)
        finally:
            b.close()

        b = bloom.ShaBloom('pybuptest.bloom')
        try:
            all_present = True
            for h in hashes:
                all_present &= b.exists(h)
            WVPASS(all_present)
            false_positives = 0
            for h in [os.urandom(20) for i in range(1000)]:
                if b.exists(h):
                    false_positives += 1
            WVPASSLT(false_positives, 5)
        finally:
            # Do not leave the filter open across the unlink below.
            b.close()
        os.unlink('pybuptest.bloom')

    tf = tempfile.TemporaryFile()
    try:
        b = bloom.create('bup.bloom', f=tf, expected=100)
        try:
            WVPASSEQ(b.rwfile, tf)
            WVPASSEQ(b.k, 5)
        finally:
            b.close()
    finally:
        tf.close()

    tf = tempfile.TemporaryFile()
    try:
        b = bloom.create('bup.bloom', f=tf, expected=2**28, delaywrite=False)
        try:
            WVPASSEQ(b.k, 4)
        finally:
            b.close()
    finally:
        tf.close()
Exemplo n.º 3
0
def test_bloom():
    """Exercise bloom filters inside a per-test temporary directory.

    For k in (4, 5) a filter is built from 100 random 20-byte hashes,
    reopened, and checked for full recall and fewer than 5 false positives
    among 1000 fresh random hashes.  The k chosen by bloom.create() is then
    checked for expected=100 and, where the allocation succeeds, for
    expected=2**28.

    All filters and temporary files are closed explicitly before the
    enclosing context managers exit.
    """
    with no_lingering_errors():
        with test_tempdir('bup-tbloom-') as tmpdir:
            hashes = [os.urandom(20) for i in range(100)]

            class Idx:
                # Bare attribute holder handed to add_idx().
                pass

            ix = Idx()
            ix.name = 'dummy.idx'
            ix.shatable = ''.join(hashes)
            bloom_path = tmpdir + '/pybuptest.bloom'
            for k in (4, 5):
                b = bloom.create(bloom_path, expected=100, k=k)
                try:
                    b.add_idx(ix)
                    WVPASSLT(b.pfalse_positive(), .1)
                finally:
                    b.close()

                b = bloom.ShaBloom(bloom_path)
                try:
                    WVPASS(all(b.exists(h) for h in hashes))
                    probes = [os.urandom(20) for i in range(1000)]
                    false_positives = sum(1 for h in probes if b.exists(h))
                    WVPASSLT(false_positives, 5)
                finally:
                    # Close the reopened filter before removing its file.
                    b.close()
                os.unlink(bloom_path)

            tf = tempfile.TemporaryFile(dir=tmpdir)
            try:
                b = bloom.create('bup.bloom', f=tf, expected=100)
                try:
                    WVPASSEQ(b.rwfile, tf)
                    WVPASSEQ(b.k, 5)
                finally:
                    b.close()
            finally:
                tf.close()

            # Test large (~1GiB) filter.  This may fail on s390 (31-bit
            # architecture), and anywhere else where the address space is
            # sufficiently limited.
            tf = tempfile.TemporaryFile(dir=tmpdir)
            try:
                b = None
                try:
                    b = bloom.create('bup.bloom',
                                     f=tf,
                                     expected=2**28,
                                     delaywrite=False)
                except EnvironmentError as ex:
                    (ptr_width, linkage) = platform.architecture()
                    if ptr_width == '32bit' and ex.errno == errno.ENOMEM:
                        WVMSG(
                            'skipping large bloom filter test (mmap probably failed) '
                            + str(ex))
                    else:
                        raise
                # b stays None only when the creation was skipped above.
                if b is not None:
                    try:
                        WVPASSEQ(b.k, 4)
                    finally:
                        b.close()
            finally:
                tf.close()
Exemplo n.º 4
0
def test_bloom():
    """Exercise bloom filters in a scratch directory under bup_tmp.

    For k in (4, 5) a filter is built from 100 random 20-byte hashes,
    reopened, and checked for full recall and fewer than 5 false positives
    among 1000 fresh random hashes.  The k chosen by bloom.create() is then
    checked for expected=100 and, where the allocation succeeds, for
    expected=2**28.

    The scratch directory is removed only when no new wvtest failures were
    recorded, so a failing run leaves its files behind.  All filters and
    temporary files are closed explicitly before that removal.
    """
    initial_failures = wvfailure_count()
    tmpdir = tempfile.mkdtemp(dir=bup_tmp, prefix='bup-tbloom-')
    bloom_path = tmpdir + '/pybuptest.bloom'
    hashes = [os.urandom(20) for i in range(100)]

    class Idx:
        # Bare attribute holder handed to add_idx().
        pass

    ix = Idx()
    ix.name = 'dummy.idx'
    ix.shatable = ''.join(hashes)
    for k in (4, 5):
        b = bloom.create(bloom_path, expected=100, k=k)
        try:
            b.add_idx(ix)
            WVPASSLT(b.pfalse_positive(), .1)
        finally:
            b.close()

        b = bloom.ShaBloom(bloom_path)
        try:
            WVPASS(all(b.exists(h) for h in hashes))
            probes = [os.urandom(20) for i in range(1000)]
            false_positives = sum(1 for h in probes if b.exists(h))
            WVPASSLT(false_positives, 5)
        finally:
            # Close the reopened filter before removing its file.
            b.close()
        os.unlink(bloom_path)

    tf = tempfile.TemporaryFile()
    try:
        b = bloom.create('bup.bloom', f=tf, expected=100)
        try:
            WVPASSEQ(b.rwfile, tf)
            WVPASSEQ(b.k, 5)
        finally:
            b.close()
    finally:
        tf.close()

    # Test large (~1GiB) filter.  This may fail on s390 (31-bit
    # architecture), and anywhere else where the address space is
    # sufficiently limited.
    tf = tempfile.TemporaryFile()
    try:
        b = None
        try:
            b = bloom.create('bup.bloom', f=tf, expected=2**28,
                             delaywrite=False)
        except EnvironmentError as ex:
            (ptr_width, linkage) = platform.architecture()
            if ptr_width == '32bit' and ex.errno == errno.ENOMEM:
                WVMSG('skipping large bloom filter test (mmap probably failed) '
                      + str(ex))
            else:
                raise
        # b stays None only when the creation was skipped above.
        if b is not None:
            try:
                WVPASSEQ(b.k, 4)
            finally:
                b.close()
    finally:
        tf.close()

    if wvfailure_count() == initial_failures:
        subprocess.call(['rm', '-rf', tmpdir])
Exemplo n.º 5
0
def test_bloom(tmpdir):
    """Exercise bloom filter creation, reopening and parameter selection.

    ``tmpdir`` is used as a bytes path prefix for the on-disk filter and as
    the directory for the temporary files.

    For k in (4, 5) a filter is built from 100 random 20-byte hashes,
    reopened, and checked for full recall and fewer than 5 false positives
    among 1000 fresh random hashes.  The k chosen by bloom.create() is then
    checked for expected=100 and, where the allocation succeeds, for
    expected=2**28.
    """
    hashes = [os.urandom(20) for i in range(100)]

    class Idx:
        # Bare attribute holder handed to add_idx().
        pass

    ix = Idx()
    ix.name = b'dummy.idx'
    ix.shatable = b''.join(hashes)
    bloom_path = tmpdir + b'/pybuptest.bloom'
    for k in (4, 5):
        with bloom.create(bloom_path, expected=100, k=k) as b:
            b.add_idx(ix)
            assert b.pfalse_positive() < .1
        with bloom.ShaBloom(bloom_path) as b:
            # all() only needs truthiness, so a None result from exists()
            # simply counts as "absent".
            assert all(b.exists(h) for h in hashes)
            probes = [os.urandom(20) for i in range(1000)]
            false_positives = sum(1 for h in probes if b.exists(h))
            assert false_positives < 5
        os.unlink(bloom_path)

    with tempfile.TemporaryFile(dir=tmpdir) as tf:
        with bloom.create(b'bup.bloom', f=tf, expected=100) as b:
            assert b.file == tf
            assert b.k == 5

    # Test large (~1GiB) filter.  This may fail on s390 (31-bit
    # architecture), and anywhere else where the address space is
    # sufficiently limited.
    with tempfile.TemporaryFile(dir=tmpdir) as tf:
        try:
            with bloom.create(b'bup.bloom', f=tf, expected=2**28,
                              delaywrite=False) as b:
                assert b.k == 4
        except EnvironmentError as ex:
            (ptr_width, linkage) = platform.architecture()
            if ptr_width == '32bit' and ex.errno == errno.ENOMEM:
                logging.getLogger().info(
                    'skipping large bloom filter test (mmap probably failed) ' +
                    str(ex))
            else:
                raise
Exemplo n.º 6
0
Arquivo: gc-cmd.py Projeto: 0xkag/bup
def find_live_objects(existing_count, cat_pipe, opt):
    """Return a bloom filter containing the ids of all objects reachable from refs.

    A temporary '.bloom' file is created in the repository's pack
    directory and sized for ``existing_count`` entries.  Every ref returned
    by git.list_refs() is walked with walk_object(); the binary id of each
    item encountered is added to the filter.  Trees already visited are
    remembered so that the walk can stop at them.

    When ``opt.verbose`` is set, each item is reported via
    report_live_item(), a running count of newly added ids is kept, and
    the filter's pfalse_positive() is logged at the end.

    The still-open filter is returned; this function neither closes it nor
    removes its backing file.
    """
    # Portable replacements for str.decode('hex') / str.encode('hex'),
    # which only exist on Python 2.
    from binascii import hexlify, unhexlify

    prune_visited_trees = True # In case we want a command line option later
    pack_dir = git.repo('objects/pack')
    ffd, bloom_filename = tempfile.mkstemp('.bloom', 'tmp-gc-', pack_dir)
    os.close(ffd)
    # FIXME: allow selection of k?
    # FIXME: support ephemeral bloom filters (i.e. *never* written to disk)
    live_objs = bloom.create(bloom_filename, expected=existing_count, k=None)
    stop_at, trees_visited = None, None
    if prune_visited_trees:
        trees_visited = set()
        # Plain parameter instead of "lambda (x):", which is a syntax error
        # on Python 3.
        stop_at = lambda x: unhexlify(x) in trees_visited
    approx_live_count = 0
    for ref_name, ref_id in git.list_refs():
        for item in walk_object(cat_pipe, hexlify(ref_id),
                                stop_at=stop_at,
                                include_data=None):
            # FIXME: batch ids
            if opt.verbose:
                report_live_item(approx_live_count, existing_count,
                                 ref_name, ref_id, item)
            bin_id = unhexlify(item.id)
            if trees_visited is not None and item.type == 'tree':
                trees_visited.add(bin_id)
            if opt.verbose:
                # Only ids not yet in the filter bump the reported count.
                if not live_objs.exists(bin_id):
                    live_objs.add(bin_id)
                    approx_live_count += 1
            else:
                live_objs.add(bin_id)
    trees_visited = None  # drop the reference to the visited-tree set
    if opt.verbose:
        log('expecting to retain about %.2f%% unnecessary objects\n'
            % live_objs.pfalse_positive())
    return live_objs
Exemplo n.º 7
0
Arquivo: gc-cmd.py Projeto: yafey/bup
def find_live_objects(existing_count, cat_pipe, opt):
    """Collect the ids of all objects reachable from refs into a bloom filter.

    The filter is backed by a temporary '.bloom' file created in the
    repository's pack directory and sized for ``existing_count`` entries.
    Each ref from git.list_refs() is walked with walk_object() and the
    binary id of every item is added.  Visited trees are recorded so the
    walk can stop when it reaches one again.

    With ``opt.verbose`` set, items are reported through
    report_live_item(), newly added ids are counted, and the filter's
    pfalse_positive() is logged once the walk is done.

    Returns the open filter; closing it and removing its backing file are
    not done here.
    """
    # binascii works on both Python 2 and 3, unlike the Python-2-only
    # 'hex' codec.
    from binascii import hexlify, unhexlify

    prune_visited_trees = True # In case we want a command line option later
    pack_dir = git.repo('objects/pack')
    ffd, bloom_filename = tempfile.mkstemp('.bloom', 'tmp-gc-', pack_dir)
    os.close(ffd)
    # FIXME: allow selection of k?
    # FIXME: support ephemeral bloom filters (i.e. *never* written to disk)
    live_objs = bloom.create(bloom_filename, expected=existing_count, k=None)
    stop_at, trees_visited = None, None
    if prune_visited_trees:
        trees_visited = set()
        # "lambda (x):" (tuple-parameter syntax) does not parse on Python 3.
        stop_at = lambda x: unhexlify(x) in trees_visited
    approx_live_count = 0
    for ref_name, ref_id in git.list_refs():
        for item in walk_object(cat_pipe, hexlify(ref_id),
                                stop_at=stop_at,
                                include_data=None):
            # FIXME: batch ids
            if opt.verbose:
                report_live_item(approx_live_count, existing_count,
                                 ref_name, ref_id, item)
            bin_id = unhexlify(item.id)
            if trees_visited is not None and item.type == 'tree':
                trees_visited.add(bin_id)
            if opt.verbose:
                # Count an id only the first time the filter sees it.
                if not live_objs.exists(bin_id):
                    live_objs.add(bin_id)
                    approx_live_count += 1
            else:
                live_objs.add(bin_id)
    trees_visited = None  # drop the reference to the visited-tree set
    if opt.verbose:
        log('expecting to retain about %.2f%% unnecessary objects\n'
            % live_objs.pfalse_positive())
    return live_objs
Exemplo n.º 8
0
Arquivo: gc.py Projeto: jmberg/bup
def find_live_objects(repo, existing_count, cat_pipe, verbosity=0):
    """Return a bloom filter holding the oid of every object reachable from repo's refs.

    The filter is created on a temporary file in ``repo.packdir()``, sized
    for ``existing_count`` entries; the file is unlinked right after the
    filter has been created.  Each ref from ``repo.refs()`` is walked with
    walk_object() and every item's oid is added to the filter.  Trees that
    were already visited make the walk stop.

    With a non-zero ``verbosity`` each item is reported via
    report_live_item(), oids not yet present in the filter are counted, and
    the filter's pfalse_positive() is logged at the end.
    """
    prune_visited_trees = True  # In case we want a command line option later
    fd, bloom_path = tempfile.mkstemp(b'.bloom', b'tmp-gc-', repo.packdir())
    os.close(fd)
    # FIXME: allow selection of k?
    # FIXME: support ephemeral bloom filters (i.e. *never* written to disk)
    live_objs = bloom.create(bloom_path, expected=existing_count, k=None)
    # live_objs will hold on to the fd until close or exit
    os.unlink(bloom_path)

    if prune_visited_trees:
        trees_visited = set()

        def stop_at(hex_oid):
            return unhexlify(hex_oid) in trees_visited
    else:
        trees_visited = None
        stop_at = None

    approx_live_count = 0
    for ref_name, ref_id in repo.refs():
        reachable = walk_object(cat_pipe.get, hexlify(ref_id),
                                stop_at=stop_at, include_data=None)
        for item in reachable:
            # FIXME: batch ids
            if verbosity:
                report_live_item(approx_live_count, existing_count,
                                 ref_name, ref_id, item, verbosity)
            oid = item.oid
            if trees_visited is not None and item.type == b'tree':
                trees_visited.add(oid)
            # In verbose mode an oid already in the filter is neither
            # re-added nor counted.
            if verbosity and live_objs.exists(oid):
                continue
            live_objs.add(oid)
            if verbosity:
                approx_live_count += 1

    trees_visited = None
    if verbosity:
        log('expecting to retain about %.2f%% unnecessary objects\n'
            % live_objs.pfalse_positive())
    return live_objs
Exemplo n.º 9
0
def do_bloom(path, outfilename):
    """Create or update the bloom filter ``outfilename`` from the *.idx files in ``path``.

    An existing filter is reused unless ``opt.force`` is set, it is
    invalid, its size disagrees with the idx files it claims to cover, or
    adding the new entries would push its false-positive rate above
    bloom.MAX_PFALSE_POSITIVE while it is still below
    bloom.MAX_BLOOM_BITS.  Otherwise a new filter is built from every idx
    file in 'bup.tmp.bloom' inside ``path`` and renamed over
    ``outfilename`` once it has been closed.

    Returns None.  Does nothing (beyond a debug message) when no idx file
    needs adding.
    """
    global _first
    tfname = None
    b = None
    try:
        if os.path.exists(outfilename) and not opt.force:
            b = bloom.ShaBloom(outfilename)
            if not b.valid():
                debug1("bloom: Existing invalid bloom found, regenerating.\n")
                b.close()
                b = None

        add = []
        rest = []
        add_count = 0
        rest_count = 0
        for i, name in enumerate(glob.glob('%s/*.idx' % path)):
            progress('bloom: counting: %d\r' % i)
            ix = git.open_idx(name)
            ixbase = os.path.basename(name)
            # Compare against None explicitly: the filter supports len(),
            # so plain truthiness may treat an empty filter as missing.
            if b is not None and (ixbase in b.idxnames):
                rest.append(name)
                rest_count += len(ix)
            else:
                add.append(name)
                add_count += len(ix)

        if not add:
            debug1("bloom: nothing to do.\n")
            return

        if b is not None:
            if len(b) != rest_count:
                debug1("bloom: size %d != idx total %d, regenerating\n"
                       % (len(b), rest_count))
                b.close()
                b = None
            elif (b.bits < bloom.MAX_BLOOM_BITS and
                  b.pfalse_positive(add_count) > bloom.MAX_PFALSE_POSITIVE):
                debug1("bloom: regenerating: adding %d entries gives "
                       "%.2f%% false positives.\n"
                       % (add_count, b.pfalse_positive(add_count)))
                b.close()
                b = None
            else:
                # Release the read-only handle before reopening for update.
                b.close()
                b = None
                b = bloom.ShaBloom(outfilename, readwrite=True,
                                   expected=add_count)
        if b is None: # Need all idxs to build from scratch
            add += rest
            add_count += rest_count
        del rest
        del rest_count

        # The and/or idiom is safe here: the middle operands are never empty.
        msg = b is None and 'creating from' or 'adding'
        if not _first: _first = path
        dirprefix = (_first != path) and git.repo_rel(path)+': ' or ''
        progress('bloom: %s%s %d file%s (%d object%s).\n'
            % (dirprefix, msg,
               len(add), len(add)!=1 and 's' or '',
               add_count, add_count!=1 and 's' or ''))

        if b is None:
            tfname = os.path.join(path, 'bup.tmp.bloom')
            b = bloom.create(tfname, expected=add_count, k=opt.k)
        icount = 0
        for name in add:
            ix = git.open_idx(name)
            # add_count is 0 when every idx file to add is empty; avoid
            # dividing by it.
            if add_count:
                pct = icount*100.0/add_count
            else:
                pct = 100.0
            qprogress('bloom: writing %.2f%% (%d/%d objects)\r'
                      % (pct, icount, add_count))
            b.add_idx(ix)
            icount += len(ix)
    finally:
        # Currently, there's an open file object for tfname inside b.
        # Make sure it's closed before rename (and on every other exit).
        if b is not None:
            b.close()

    if tfname:
        os.rename(tfname, outfilename)
Exemplo n.º 10
0
Arquivo: bloom.py Projeto: jmberg/bup
def do_bloom(path, outfilename, k, force):
    """Create or update the bloom filter ``outfilename`` from the *.idx files in ``path``.

    Args:
        path: directory (bytes) scanned for ``*.idx`` files.
        outfilename: path of the bloom filter to create or update.
        k: None, 4 or 5; passed to bloom.create() and, when not None,
            compared with an existing filter's k.
        force: when true, an existing filter is ignored and rebuilt.

    An existing filter is regenerated when it is invalid, its size
    disagrees with the idx files it claims to cover, its k differs from
    the requested one, or adding the new entries would push its
    false-positive rate above bloom.MAX_PFALSE_POSITIVE while it is still
    below bloom.MAX_BLOOM_BITS[b.k].  A regenerated filter is written to
    'bup.tmp.bloom' inside ``path`` and renamed over ``outfilename`` once
    it has been closed.

    Returns None.  Does nothing (beyond a debug message) when no idx file
    needs adding.
    """
    global _first
    assert k in (None, 4, 5)
    tfname = None
    b = None
    try:
        if os.path.exists(outfilename) and not force:
            b = bloom.ShaBloom(outfilename)
            if not b.valid():
                debug1("bloom: Existing invalid bloom found, regenerating.\n")
                b.close()
                b = None

        add = []
        rest = []
        add_count = 0
        rest_count = 0
        for i, name in enumerate(glob.glob(b'%s/*.idx' % path)):
            progress('bloom: counting: %d\r' % i)
            ix = git.open_idx(name)
            ixbase = os.path.basename(name)
            # Compare against None explicitly: the filter supports len(),
            # so plain truthiness may treat an empty filter as missing.
            if b is not None and (ixbase in b.idxnames):
                rest.append(name)
                rest_count += len(ix)
            else:
                add.append(name)
                add_count += len(ix)

        if not add:
            debug1("bloom: nothing to do.\n")
            return

        if b is not None:
            if len(b) != rest_count:
                debug1("bloom: size %d != idx total %d, regenerating\n" %
                       (len(b), rest_count))
                b.close()
                b = None
            elif k is not None and k != b.k:
                debug1("bloom: new k %d != existing k %d, regenerating\n" %
                       (k, b.k))
                b.close()
                b = None
            elif (b.bits < bloom.MAX_BLOOM_BITS[b.k]
                  and b.pfalse_positive(add_count) > bloom.MAX_PFALSE_POSITIVE):
                debug1("bloom: regenerating: adding %d entries gives "
                       "%.2f%% false positives.\n" %
                       (add_count, b.pfalse_positive(add_count)))
                b.close()
                b = None
            else:
                # Release the read-only handle before reopening for update.
                b.close()
                b = None
                b = bloom.ShaBloom(outfilename, readwrite=True,
                                   expected=add_count)
        if b is None:  # Need all idxs to build from scratch
            add += rest
            add_count += rest_count
        del rest
        del rest_count

        msg = 'creating from' if b is None else 'adding'
        if not _first:
            _first = path
        dirprefix = (git.repo_rel(path) + b': ') if _first != path else b''
        progress('bloom: %s%s %d file%s (%d object%s).\r' %
                 (path_msg(dirprefix), msg,
                  len(add), 's' if len(add) != 1 else '',
                  add_count, 's' if add_count != 1 else ''))

        if b is None:
            tfname = os.path.join(path, b'bup.tmp.bloom')
            b = bloom.create(tfname, expected=add_count, k=k)
        icount = 0
        for name in add:
            ix = git.open_idx(name)
            # add_count is 0 when every idx file to add is empty; avoid
            # dividing by it.
            pct = icount * 100.0 / add_count if add_count else 100.0
            qprogress('bloom: writing %.2f%% (%d/%d objects)\r' %
                      (pct, icount, add_count))
            b.add_idx(ix)
            icount += len(ix)
    finally:
        # Currently, there's an open file object for tfname inside b.
        # Make sure it's closed before rename (and on every other exit).
        if b is not None:
            b.close()

    if tfname:
        os.rename(tfname, outfilename)
Exemplo n.º 11
0
def do_bloom(path, outfilename):
    """Create or update the bloom filter ``outfilename`` from the *.idx files in ``path``.

    An existing filter is reused unless ``opt.force`` is set, it is
    invalid, its size disagrees with the idx files it claims to cover, or
    adding the new entries would push its false-positive rate above
    bloom.MAX_PFALSE_POSITIVE while it is still below
    bloom.MAX_BLOOM_BITS.  Otherwise a new filter is built from every idx
    file on a temporary file 'bup.tmp.bloom' inside ``path``.

    The filter is always closed before the temporary file is renamed over
    ``outfilename``, so the rename never publishes a filter that is still
    open for writing.

    Returns None.  Does nothing (beyond a debug message) when no idx file
    needs adding.
    """
    global _first
    tfname = None
    tf = None
    b = None
    try:
        if os.path.exists(outfilename) and not opt.force:
            b = bloom.ShaBloom(outfilename)
            if not b.valid():
                debug1("bloom: Existing invalid bloom found, regenerating.\n")
                b.close()
                b = None

        add = []
        rest = []
        add_count = 0
        rest_count = 0
        for i, name in enumerate(glob.glob('%s/*.idx' % path)):
            progress('bloom: counting: %d\r' % i)
            ix = git.open_idx(name)
            ixbase = os.path.basename(name)
            # Compare against None explicitly: the filter supports len(),
            # so plain truthiness may treat an empty filter as missing.
            if b is not None and (ixbase in b.idxnames):
                rest.append(name)
                rest_count += len(ix)
            else:
                add.append(name)
                add_count += len(ix)

        if not add:
            debug1("bloom: nothing to do.\n")
            return

        if b is not None:
            if len(b) != rest_count:
                debug1("bloom: size %d != idx total %d, regenerating\n" %
                       (len(b), rest_count))
                b.close()
                b = None
            elif (b.bits < bloom.MAX_BLOOM_BITS
                  and b.pfalse_positive(add_count) > bloom.MAX_PFALSE_POSITIVE):
                debug1("bloom: regenerating: adding %d entries gives "
                       "%.2f%% false positives.\n" %
                       (add_count, b.pfalse_positive(add_count)))
                b.close()
                b = None
            else:
                # Release the read-only handle before reopening for update.
                b.close()
                b = None
                b = bloom.ShaBloom(outfilename, readwrite=True,
                                   expected=add_count)
        if b is None:  # Need all idxs to build from scratch
            add += rest
            add_count += rest_count
        del rest
        del rest_count

        # The and/or idiom is safe here: the middle operands are never empty.
        msg = b is None and 'creating from' or 'adding'
        if not _first: _first = path
        dirprefix = (_first != path) and git.repo_rel(path) + ': ' or ''
        progress('bloom: %s%s %d file%s (%d object%s).\n' %
                 (dirprefix, msg, len(add), len(add) != 1 and 's'
                  or '', add_count, add_count != 1 and 's' or ''))

        if b is None:
            tfname = os.path.join(path, 'bup.tmp.bloom')
            # Binary mode: the file backs a bit-level filter, not text.
            tf = open(tfname, 'w+b')
            b = bloom.create(tfname, f=tf, expected=add_count, k=opt.k)
        icount = 0
        for name in add:
            ix = git.open_idx(name)
            # add_count is 0 when every idx file to add is empty; avoid
            # dividing by it.
            if add_count:
                pct = icount * 100.0 / add_count
            else:
                pct = 100.0
            qprogress('bloom: writing %.2f%% (%d/%d objects)\r' %
                      (pct, icount, add_count))
            b.add_idx(ix)
            icount += len(ix)
    finally:
        # Close the filter first, then the file object handed to it;
        # closing an already-closed file object is a no-op.
        if b is not None:
            b.close()
        if tf is not None:
            tf.close()

    if tfname:
        os.rename(tfname, outfilename)