def check_midx(name): nicename = git.repo_rel(name) log('Checking %s.\n' % path_msg(nicename)) try: ix = git.open_idx(name) except git.GitError as e: add_error('%s: %s' % (path_msg(name), e)) return for count,subname in enumerate(ix.idxnames): sub = git.open_idx(os.path.join(os.path.dirname(name), subname)) for ecount,e in enumerate(sub): if not (ecount % 1234): qprogress(' %d/%d: %s %d/%d\r' % (count, len(ix.idxnames), git.shorten_hash(subname).decode('ascii'), ecount, len(sub))) if not sub.exists(e): add_error("%s: %s: %s missing from idx" % (path_msg(nicename), git.shorten_hash(subname).decode('ascii'), hexstr(e))) if not ix.exists(e): add_error("%s: %s: %s missing from midx" % (path_msg(nicename), git.shorten_hash(subname).decode('ascii'), hexstr(e))) prev = None for ecount,e in enumerate(ix): if not (ecount % 1234): qprogress(' Ordering: %d/%d\r' % (ecount, len(ix))) if e and prev and not e >= prev: add_error('%s: ordering error: %s < %s' % (nicename, hexstr(e), hexstr(prev))) prev = e
def check_midx(name): nicename = git.repo_rel(name) log('Checking %s.\n' % nicename) try: ix = git.open_idx(name) except git.GitError as e: add_error('%s: %s' % (name, e)) return for count,subname in enumerate(ix.idxnames): sub = git.open_idx(os.path.join(os.path.dirname(name), subname)) for ecount,e in enumerate(sub): if not (ecount % 1234): qprogress(' %d/%d: %s %d/%d\r' % (count, len(ix.idxnames), git.shorten_hash(subname), ecount, len(sub))) if not sub.exists(e): add_error("%s: %s: %s missing from idx" % (nicename, git.shorten_hash(subname), str(e).encode('hex'))) if not ix.exists(e): add_error("%s: %s: %s missing from midx" % (nicename, git.shorten_hash(subname), str(e).encode('hex'))) prev = None for ecount,e in enumerate(ix): if not (ecount % 1234): qprogress(' Ordering: %d/%d\r' % (ecount, len(ix))) if not e >= prev: add_error('%s: ordering error: %s < %s' % (nicename, str(e).encode('hex'), str(prev).encode('hex'))) prev = e
def ruin_bloom(bloomfilename): rbloomfilename = git.repo_rel(bloomfilename) if not os.path.exists(bloomfilename): log(path_msg(bloomfilename) + '\n') add_error('bloom: %s not found to ruin\n' % path_msg(rbloomfilename)) return b = bloom.ShaBloom(bloomfilename, readwrite=True, expected=1) b.map[16:16 + 2**b.bits] = b'\0' * 2**b.bits
def ruin_bloom(bloomfilename): rbloomfilename = git.repo_rel(bloomfilename) if not os.path.exists(bloomfilename): log("%s\n" % bloomfilename) add_error("bloom: %s not found to ruin\n" % rbloomfilename) return b = bloom.ShaBloom(bloomfilename, readwrite=True, expected=1) b.map[16:16+2**b.bits] = '\0' * 2**b.bits
def check_midx(name): nicename = git.repo_rel(name) log('Checking %s.\n' % nicename) try: ix = git.open_idx(name) except git.GitError, e: add_error('%s: %s' % (name, e)) return
def check_bloom(path, bloomfilename, idx): rbloomfilename = git.repo_rel(bloomfilename) ridx = git.repo_rel(idx) if not os.path.exists(bloomfilename): log('bloom: %s: does not exist.\n' % path_msg(rbloomfilename)) return b = bloom.ShaBloom(bloomfilename) if not b.valid(): add_error('bloom: %r is invalid.\n' % path_msg(rbloomfilename)) return base = os.path.basename(idx) if base not in b.idxnames: log('bloom: %s does not contain the idx.\n' % path_msg(rbloomfilename)) return if base == idx: idx = os.path.join(path, idx) log('bloom: bloom file: %s\n' % path_msg(rbloomfilename)) log('bloom: checking %s\n' % path_msg(ridx)) for objsha in git.open_idx(idx): if not b.exists(objsha): add_error('bloom: ERROR: object %s missing' % hexstr(objsha))
def check_bloom(path, bloomfilename, idx): rbloomfilename = git.repo_rel(bloomfilename) ridx = git.repo_rel(idx) if not os.path.exists(bloomfilename): log("bloom: %s: does not exist.\n" % rbloomfilename) return b = bloom.ShaBloom(bloomfilename) if not b.valid(): add_error("bloom: %r is invalid.\n" % rbloomfilename) return base = os.path.basename(idx) if base not in b.idxnames: log("bloom: %s does not contain the idx.\n" % rbloomfilename) return if base == idx: idx = os.path.join(path, idx) log("bloom: bloom file: %s\n" % rbloomfilename) log("bloom: checking %s\n" % ridx) for objsha in git.open_idx(idx): if not b.exists(objsha): add_error("bloom: ERROR: object %s missing" % str(objsha).encode("hex"))
def _do_midx(outdir, outfilename, infilenames, prefixstr): global _first if not outfilename: assert(outdir) sum = Sha1('\0'.join(infilenames)).hexdigest() outfilename = '%s/midx-%s.midx' % (outdir, sum) inp = [] total = 0 allfilenames = [] for name in infilenames: ix = git.open_idx(name) inp.append(( ix.map, len(ix), ix.sha_ofs, isinstance(ix, midx.PackMidx) and ix.which_ofs or 0, len(allfilenames), )) for n in ix.idxnames: allfilenames.append(os.path.basename(n)) total += len(ix) inp.sort(lambda x,y: cmp(str(y[0][y[2]:y[2]+20]),str(x[0][x[2]:x[2]+20]))) if not _first: _first = outdir dirprefix = (_first != outdir) and git.repo_rel(outdir)+': ' or '' log('midx: %s%screating from %d files (%d objects).\n' % (dirprefix, prefixstr, len(infilenames), total)) if (opt.auto and (total < 1024 and len(infilenames) < 3)) \ or ((opt.auto or opt.force) and len(infilenames) < 2) \ or (opt.force and not total): debug1('midx: nothing to do.\n') return pages = int(total/SHA_PER_PAGE) or 1 bits = int(math.ceil(math.log(pages, 2))) entries = 2**bits debug1('midx: table size: %d (%d bits)\n' % (entries*4, bits)) unlink(outfilename) f = open(outfilename + '.tmp', 'w+b') f.write('MIDX') f.write(struct.pack('!II', midx.MIDX_VERSION, bits)) assert(f.tell() == 12) f.truncate(12 + 4*entries + 20*total + 4*total) fmap = mmap_readwrite(f, close=False) count = merge_into(fmap, bits, total, inp) del fmap f.seek(0, git.SEEK_END) f.write('\0'.join(allfilenames)) f.close() os.rename(outfilename + '.tmp', outfilename) # this is just for testing if 0: p = midx.PackMidx(outfilename) assert(len(p.idxnames) == len(infilenames)) print p.idxnames assert(len(p) == total) for pe, e in p, git.idxmerge(inp, final_progress=False): assert(i == pi.next()) assert(p.exists(i)) return total, outfilename
def sweep(live_objects, existing_count, cat_pipe, threshold, compression, verbosity): # Traverse all the packs, saving the (probably) live data. ns = Nonlocal() ns.stale_files = [] def remove_stale_files(new_pack_prefix): if verbosity and new_pack_prefix: log('created ' + basename(new_pack_prefix) + '\n') for p in ns.stale_files: if new_pack_prefix and p.startswith(new_pack_prefix): continue # Don't remove the new pack file if verbosity: log('removing ' + basename(p) + '\n') os.unlink(p) if ns.stale_files: # So git cat-pipe will close them cat_pipe.restart() ns.stale_files = [] writer = git.PackWriter(objcache_maker=None, compression_level=compression, run_midx=False, on_pack_finish=remove_stale_files) # FIXME: sanity check .idx names vs .pack names? collect_count = 0 for idx_name in glob.glob(os.path.join(git.repo('objects/pack'), '*.idx')): if verbosity: qprogress('preserving live data (%d%% complete)\r' % ((float(collect_count) / existing_count) * 100)) idx = git.open_idx(idx_name) idx_live_count = 0 for i in xrange(0, len(idx)): sha = idx.shatable[i * 20 : (i + 1) * 20] if live_objects.exists(sha): idx_live_count += 1 collect_count += idx_live_count if idx_live_count == 0: if verbosity: log('deleting %s\n' % git.repo_rel(basename(idx_name))) ns.stale_files.append(idx_name) ns.stale_files.append(idx_name[:-3] + 'pack') continue live_frac = idx_live_count / float(len(idx)) if live_frac > ((100 - threshold) / 100.0): if verbosity: log('keeping %s (%d%% live)\n' % (git.repo_rel(basename(idx_name)), live_frac * 100)) continue if verbosity: log('rewriting %s (%.2f%% live)\n' % (basename(idx_name), live_frac * 100)) for i in xrange(0, len(idx)): sha = idx.shatable[i * 20 : (i + 1) * 20] if live_objects.exists(sha): item_it = cat_pipe.get(sha.encode('hex')) type = item_it.next() writer.just_write(sha, type, ''.join(item_it)) ns.stale_files.append(idx_name) ns.stale_files.append(idx_name[:-3] + 'pack') if verbosity: progress('preserving live data (%d%% complete)\n' % ((float(collect_count) / existing_count) * 100)) # Nothing should have recreated midx/bloom yet. pack_dir = git.repo('objects/pack') assert(not os.path.exists(os.path.join(pack_dir, 'bup.bloom'))) assert(not glob.glob(os.path.join(pack_dir, '*.midx'))) # try/catch should call writer.abort()? # This will finally run midx. writer.close() # Can only change refs (if needed) after this. remove_stale_files(None) # In case we didn't write to the writer. if verbosity: log('discarded %d%% of objects\n' % ((existing_count - count_objects(pack_dir, verbosity)) / float(existing_count) * 100))
def _do_midx(outdir, outfilename, infilenames, prefixstr, auto=False, force=False): global _first if not outfilename: assert (outdir) sum = hexlify(Sha1(b'\0'.join(infilenames)).digest()) outfilename = b'%s/midx-%s.midx' % (outdir, sum) inp = [] total = 0 allfilenames = [] with ExitStack() as contexts: for name in infilenames: ix = git.open_idx(name) contexts.enter_context(ix) inp.append(( ix.map, len(ix), ix.sha_ofs, isinstance(ix, midx.PackMidx) and ix.which_ofs or 0, len(allfilenames), )) for n in ix.idxnames: allfilenames.append(os.path.basename(n)) total += len(ix) inp.sort(reverse=True, key=lambda x: x[0][x[2]:x[2] + 20]) if not _first: _first = outdir dirprefix = (_first != outdir) and git.repo_rel(outdir) + b': ' or b'' debug1('midx: %s%screating from %d files (%d objects).\n' % (dirprefix, prefixstr, len(infilenames), total)) if (auto and (total < 1024 and len(infilenames) < 3)) \ or ((auto or force) and len(infilenames) < 2) \ or (force and not total): debug1('midx: nothing to do.\n') return None pages = int(total / SHA_PER_PAGE) or 1 bits = int(math.ceil(math.log(pages, 2))) entries = 2**bits debug1('midx: table size: %d (%d bits)\n' % (entries * 4, bits)) unlink(outfilename) with atomically_replaced_file(outfilename, 'w+b') as f: f.write(b'MIDX') f.write(struct.pack('!II', midx.MIDX_VERSION, bits)) assert (f.tell() == 12) f.truncate(12 + 4 * entries + 20 * total + 4 * total) f.flush() fdatasync(f.fileno()) with mmap_readwrite(f, close=False) as fmap: count = merge_into(fmap, bits, total, inp) f.seek(0, os.SEEK_END) f.write(b'\0'.join(allfilenames)) # This is just for testing (if you enable this, don't clear inp above) # if 0: # p = midx.PackMidx(outfilename) # assert(len(p.idxnames) == len(infilenames)) # log(repr(p.idxnames) + '\n') # assert(len(p) == total) # for pe, e in p, git.idxmerge(inp, final_progress=False): # pin = next(pi) # assert(i == pin) # assert(p.exists(i)) return total, outfilename
def sweep(live_objects, existing_count, cat_pipe, opt): # Traverse all the packs, saving the (probably) live data. ns = Nonlocal() ns.stale_files = [] def remove_stale_files(new_pack_prefix): if opt.verbose and new_pack_prefix: log('created ' + basename(new_pack_prefix) + '\n') for p in ns.stale_files: if opt.verbose: log('removing ' + basename(p) + '\n') os.unlink(p) ns.stale_files = [] writer = git.PackWriter(objcache_maker=None, compression_level=opt.compress, run_midx=False, on_pack_finish=remove_stale_files) # FIXME: sanity check .idx names vs .pack names? collect_count = 0 for idx_name in glob.glob(os.path.join(git.repo('objects/pack'), '*.idx')): if opt.verbose: qprogress('preserving live data (%d%% complete)\r' % ((float(collect_count) / existing_count) * 100)) idx = git.open_idx(idx_name) idx_live_count = 0 for i in xrange(0, len(idx)): sha = idx.shatable[i * 20:(i + 1) * 20] if live_objects.exists(sha): idx_live_count += 1 collect_count += idx_live_count if idx_live_count == 0: if opt.verbose: log('deleting %s\n' % git.repo_rel(basename(idx_name))) ns.stale_files.append(idx_name) ns.stale_files.append(idx_name[:-3] + 'pack') continue live_frac = idx_live_count / float(len(idx)) if live_frac > ((100 - opt.threshold) / 100.0): if opt.verbose: log('keeping %s (%d%% live)\n' % (git.repo_rel(basename(idx_name)), live_frac * 100)) continue if opt.verbose: log('rewriting %s (%.2f%% live)\n' % (basename(idx_name), live_frac * 100)) for i in xrange(0, len(idx)): sha = idx.shatable[i * 20:(i + 1) * 20] if live_objects.exists(sha): item_it = cat_pipe.get(sha.encode('hex')) type = item_it.next() writer.write(sha, type, ''.join(item_it)) ns.stale_files.append(idx_name) ns.stale_files.append(idx_name[:-3] + 'pack') if opt.verbose: progress('preserving live data (%d%% complete)\n' % ((float(collect_count) / existing_count) * 100)) # Nothing should have recreated midx/bloom yet. pack_dir = git.repo('objects/pack') assert (not os.path.exists(os.path.join(pack_dir, 'bup.bloom'))) assert (not glob.glob(os.path.join(pack_dir, '*.midx'))) # try/catch should call writer.abort()? # This will finally run midx. writer.close() # Can only change refs (if needed) after this. remove_stale_files(None) # In case we didn't write to the writer. if opt.verbose: log('discarded %d%% of objects\n' % ((existing_count - count_objects(pack_dir)) / float(existing_count) * 100))
def do_bloom(path, outfilename): global _first b = None if os.path.exists(outfilename) and not opt.force: b = bloom.ShaBloom(outfilename) if not b.valid(): debug1("bloom: Existing invalid bloom found, regenerating.\n") b = None add = [] rest = [] add_count = 0 rest_count = 0 for i,name in enumerate(glob.glob('%s/*.idx' % path)): progress('bloom: counting: %d\r' % i) ix = git.open_idx(name) ixbase = os.path.basename(name) if b and (ixbase in b.idxnames): rest.append(name) rest_count += len(ix) else: add.append(name) add_count += len(ix) total = add_count + rest_count if not add: debug1("bloom: nothing to do.\n") return if b: if len(b) != rest_count: debug1("bloom: size %d != idx total %d, regenerating\n" % (len(b), rest_count)) b = None elif (b.bits < bloom.MAX_BLOOM_BITS and b.pfalse_positive(add_count) > bloom.MAX_PFALSE_POSITIVE): debug1("bloom: regenerating: adding %d entries gives " "%.2f%% false positives.\n" % (add_count, b.pfalse_positive(add_count))) b = None else: b = bloom.ShaBloom(outfilename, readwrite=True, expected=add_count) if not b: # Need all idxs to build from scratch add += rest add_count += rest_count del rest del rest_count msg = b is None and 'creating from' or 'adding' if not _first: _first = path dirprefix = (_first != path) and git.repo_rel(path)+': ' or '' progress('bloom: %s%s %d file%s (%d object%s).\n' % (dirprefix, msg, len(add), len(add)!=1 and 's' or '', add_count, add_count!=1 and 's' or '')) tfname = None if b is None: tfname = os.path.join(path, 'bup.tmp.bloom') b = bloom.create(tfname, expected=add_count, k=opt.k) count = 0 icount = 0 for name in add: ix = git.open_idx(name) qprogress('bloom: writing %.2f%% (%d/%d objects)\r' % (icount*100.0/add_count, icount, add_count)) b.add_idx(ix) count += 1 icount += len(ix) # Currently, there's an open file object for tfname inside b. # Make sure it's closed before rename. b.close() if tfname: os.rename(tfname, outfilename)
def do_bloom(path, outfilename): global _first b = None if os.path.exists(outfilename) and not opt.force: b = bloom.ShaBloom(outfilename) if not b.valid(): debug1("bloom: Existing invalid bloom found, regenerating.\n") b = None add = [] rest = [] add_count = 0 rest_count = 0 for i, name in enumerate(glob.glob('%s/*.idx' % path)): progress('bloom: counting: %d\r' % i) ix = git.open_idx(name) ixbase = os.path.basename(name) if b and (ixbase in b.idxnames): rest.append(name) rest_count += len(ix) else: add.append(name) add_count += len(ix) total = add_count + rest_count if not add: debug1("bloom: nothing to do.\n") return if b: if len(b) != rest_count: debug1("bloom: size %d != idx total %d, regenerating\n" % (len(b), rest_count)) b = None elif (b.bits < bloom.MAX_BLOOM_BITS and b.pfalse_positive(add_count) > bloom.MAX_PFALSE_POSITIVE): debug1("bloom: regenerating: adding %d entries gives " "%.2f%% false positives.\n" % (add_count, b.pfalse_positive(add_count))) b = None else: b = bloom.ShaBloom(outfilename, readwrite=True, expected=add_count) if not b: # Need all idxs to build from scratch add += rest add_count += rest_count del rest del rest_count msg = b is None and 'creating from' or 'adding' if not _first: _first = path dirprefix = (_first != path) and git.repo_rel(path) + ': ' or '' progress('bloom: %s%s %d file%s (%d object%s).\n' % (dirprefix, msg, len(add), len(add) != 1 and 's' or '', add_count, add_count != 1 and 's' or '')) tfname = None if b is None: tfname = os.path.join(path, 'bup.tmp.bloom') tf = open(tfname, 'w+') b = bloom.create(tfname, f=tf, expected=add_count, k=opt.k) count = 0 icount = 0 for name in add: ix = git.open_idx(name) qprogress('bloom: writing %.2f%% (%d/%d objects)\r' % (icount * 100.0 / add_count, icount, add_count)) b.add_idx(ix) count += 1 icount += len(ix) if tfname: os.rename(tfname, outfilename)
def sweep(live_objects, existing_count, cat_pipe, threshold, compression, verbosity): # Traverse all the packs, saving the (probably) live data. ns = Nonlocal() ns.stale_files = [] def remove_stale_files(new_pack_prefix): if verbosity and new_pack_prefix: log('created ' + path_msg(basename(new_pack_prefix)) + '\n') for p in ns.stale_files: if new_pack_prefix and p.startswith(new_pack_prefix): continue # Don't remove the new pack file if verbosity: log('removing ' + path_msg(basename(p)) + '\n') os.unlink(p) if ns.stale_files: # So git cat-pipe will close them cat_pipe.restart() ns.stale_files = [] writer = git.PackWriter(objcache_maker=None, compression_level=compression, run_midx=False, on_pack_finish=remove_stale_files) try: # FIXME: sanity check .idx names vs .pack names? collect_count = 0 for idx_name in glob.glob( os.path.join(git.repo(b'objects/pack'), b'*.idx')): if verbosity: qprogress('preserving live data (%d%% complete)\r' % ((float(collect_count) / existing_count) * 100)) with git.open_idx(idx_name) as idx: idx_live_count = 0 for sha in idx: if live_objects.exists(sha): idx_live_count += 1 collect_count += idx_live_count if idx_live_count == 0: if verbosity: log('deleting %s\n' % path_msg(git.repo_rel(basename(idx_name)))) ns.stale_files.append(idx_name) ns.stale_files.append(idx_name[:-3] + b'pack') continue live_frac = idx_live_count / float(len(idx)) if live_frac > ((100 - threshold) / 100.0): if verbosity: log('keeping %s (%d%% live)\n' % (git.repo_rel( basename(idx_name)), live_frac * 100)) continue if verbosity: log('rewriting %s (%.2f%% live)\n' % (basename(idx_name), live_frac * 100)) for sha in idx: if live_objects.exists(sha): item_it = cat_pipe.get(hexlify(sha)) _, typ, _ = next(item_it) writer.just_write(sha, typ, b''.join(item_it)) ns.stale_files.append(idx_name) ns.stale_files.append(idx_name[:-3] + b'pack') if verbosity: progress('preserving live data (%d%% complete)\n' % ((float(collect_count) / existing_count) * 100)) # Nothing should have recreated midx/bloom yet. pack_dir = git.repo(b'objects/pack') assert (not os.path.exists(os.path.join(pack_dir, b'bup.bloom'))) assert (not glob.glob(os.path.join(pack_dir, b'*.midx'))) except BaseException as ex: with pending_raise(ex): writer.abort() # This will finally run midx. # Can only change refs (if needed) after this. writer.close() remove_stale_files(None) # In case we didn't write to the writer. if verbosity: log('discarded %d%% of objects\n' % ((existing_count - count_objects(pack_dir, verbosity)) / float(existing_count) * 100))
def _do_midx(outdir, outfilename, infilenames, prefixstr): global _first if not outfilename: assert(outdir) sum = Sha1('\0'.join(infilenames)).hexdigest() outfilename = '%s/midx-%s.midx' % (outdir, sum) inp = [] total = 0 allfilenames = [] midxs = [] try: for name in infilenames: ix = git.open_idx(name) midxs.append(ix) inp.append(( ix.map, len(ix), ix.sha_ofs, isinstance(ix, midx.PackMidx) and ix.which_ofs or 0, len(allfilenames), )) for n in ix.idxnames: allfilenames.append(os.path.basename(n)) total += len(ix) inp.sort(reverse=True, key=lambda x: str(x[0][x[2]:x[2]+20])) if not _first: _first = outdir dirprefix = (_first != outdir) and git.repo_rel(outdir)+': ' or '' debug1('midx: %s%screating from %d files (%d objects).\n' % (dirprefix, prefixstr, len(infilenames), total)) if (opt.auto and (total < 1024 and len(infilenames) < 3)) \ or ((opt.auto or opt.force) and len(infilenames) < 2) \ or (opt.force and not total): debug1('midx: nothing to do.\n') return pages = int(total/SHA_PER_PAGE) or 1 bits = int(math.ceil(math.log(pages, 2))) entries = 2**bits debug1('midx: table size: %d (%d bits)\n' % (entries*4, bits)) unlink(outfilename) with atomically_replaced_file(outfilename, 'wb') as f: f.write('MIDX') f.write(struct.pack('!II', midx.MIDX_VERSION, bits)) assert(f.tell() == 12) f.truncate(12 + 4*entries + 20*total + 4*total) f.flush() fdatasync(f.fileno()) fmap = mmap_readwrite(f, close=False) count = merge_into(fmap, bits, total, inp) del fmap # Assume this calls msync() now. f.seek(0, os.SEEK_END) f.write('\0'.join(allfilenames)) finally: for ix in midxs: if isinstance(ix, midx.PackMidx): ix.close() midxs = None inp = None # This is just for testing (if you enable this, don't clear inp above) if 0: p = midx.PackMidx(outfilename) assert(len(p.idxnames) == len(infilenames)) print p.idxnames assert(len(p) == total) for pe, e in p, git.idxmerge(inp, final_progress=False): pin = next(pi) assert(i == pin) assert(p.exists(i)) return total, outfilename
def _do_midx(outdir, outfilename, infilenames, prefixstr): global _first if not outfilename: assert (outdir) sum = Sha1('\0'.join(infilenames)).hexdigest() outfilename = '%s/midx-%s.midx' % (outdir, sum) inp = [] total = 0 allfilenames = [] midxs = [] try: for name in infilenames: ix = git.open_idx(name) midxs.append(ix) inp.append(( ix.map, len(ix), ix.sha_ofs, isinstance(ix, midx.PackMidx) and ix.which_ofs or 0, len(allfilenames), )) for n in ix.idxnames: allfilenames.append(os.path.basename(n)) total += len(ix) inp.sort(lambda x, y: cmp(str(y[0][y[2]:y[2] + 20]), str(x[0][x[2]:x[2] + 20]))) if not _first: _first = outdir dirprefix = (_first != outdir) and git.repo_rel(outdir) + ': ' or '' debug1('midx: %s%screating from %d files (%d objects).\n' % (dirprefix, prefixstr, len(infilenames), total)) if (opt.auto and (total < 1024 and len(infilenames) < 3)) \ or ((opt.auto or opt.force) and len(infilenames) < 2) \ or (opt.force and not total): debug1('midx: nothing to do.\n') return pages = int(total / SHA_PER_PAGE) or 1 bits = int(math.ceil(math.log(pages, 2))) entries = 2**bits debug1('midx: table size: %d (%d bits)\n' % (entries * 4, bits)) unlink(outfilename) f = open(outfilename + '.tmp', 'w+b') f.write('MIDX') f.write(struct.pack('!II', midx.MIDX_VERSION, bits)) assert (f.tell() == 12) f.truncate(12 + 4 * entries + 20 * total + 4 * total) f.flush() fdatasync(f.fileno()) fmap = mmap_readwrite(f, close=False) count = merge_into(fmap, bits, total, inp) del fmap # Assume this calls msync() now. finally: for ix in midxs: if isinstance(ix, midx.PackMidx): ix.close() midxs = None inp = None f.seek(0, os.SEEK_END) f.write('\0'.join(allfilenames)) f.close() os.rename(outfilename + '.tmp', outfilename) # This is just for testing (if you enable this, don't clear inp above) if 0: p = midx.PackMidx(outfilename) assert (len(p.idxnames) == len(infilenames)) print p.idxnames assert (len(p) == total) for pe, e in p, git.idxmerge(inp, final_progress=False): pin = pi.next() assert (i == pin) assert (p.exists(i)) return total, outfilename
def do_bloom(path, outfilename, k, force): global _first assert k in (None, 4, 5) b = None if os.path.exists(outfilename) and not force: b = bloom.ShaBloom(outfilename) if not b.valid(): debug1("bloom: Existing invalid bloom found, regenerating.\n") b = None add = [] rest = [] add_count = 0 rest_count = 0 for i, name in enumerate(glob.glob(b'%s/*.idx' % path)): progress('bloom: counting: %d\r' % i) ix = git.open_idx(name) ixbase = os.path.basename(name) if b and (ixbase in b.idxnames): rest.append(name) rest_count += len(ix) else: add.append(name) add_count += len(ix) if not add: debug1("bloom: nothing to do.\n") return if b: if len(b) != rest_count: debug1("bloom: size %d != idx total %d, regenerating\n" % (len(b), rest_count)) b = None elif k is not None and k != b.k: debug1("bloom: new k %d != existing k %d, regenerating\n" % (k, b.k)) b = None elif (b.bits < bloom.MAX_BLOOM_BITS[b.k] and b.pfalse_positive(add_count) > bloom.MAX_PFALSE_POSITIVE): debug1("bloom: regenerating: adding %d entries gives " "%.2f%% false positives.\n" % (add_count, b.pfalse_positive(add_count))) b = None else: b = bloom.ShaBloom(outfilename, readwrite=True, expected=add_count) if not b: # Need all idxs to build from scratch add += rest add_count += rest_count del rest del rest_count msg = b is None and 'creating from' or 'adding' if not _first: _first = path dirprefix = (_first != path) and git.repo_rel(path) + b': ' or b'' progress('bloom: %s%s %d file%s (%d object%s).\r' % (path_msg(dirprefix), msg, len(add), len(add) != 1 and 's' or '', add_count, add_count != 1 and 's' or '')) tfname = None if b is None: tfname = os.path.join(path, b'bup.tmp.bloom') b = bloom.create(tfname, expected=add_count, k=k) count = 0 icount = 0 for name in add: ix = git.open_idx(name) qprogress('bloom: writing %.2f%% (%d/%d objects)\r' % (icount * 100.0 / add_count, icount, add_count)) b.add_idx(ix) count += 1 icount += len(ix) # Currently, there's an open file object for tfname inside b. # Make sure it's closed before rename. b.close() if tfname: os.rename(tfname, outfilename)