def find_live_objects(existing_count, cat_pipe, opt):
    """Walk every ref and return a bloom filter of the object ids reached.

    existing_count is passed to bloom.create() as the expected number of
    entries and to report_live_item() for progress output.  cat_pipe is
    handed to walk_object() to fetch objects.  When opt.verbose is set,
    progress is reported per item and the filter's false-positive
    estimate is logged at the end.

    The filter is backed by a temporary "tmp-gc-*.bloom" file created in
    the repository's pack directory; this function does not remove it.
    """
    # Local import: hexlify/unhexlify give the same results as the
    # Python 2 only str.encode('hex')/str.decode('hex'), but also exist
    # on Python 3.
    from binascii import hexlify, unhexlify

    prune_visited_trees = True  # In case we want a command line option later
    pack_dir = git.repo('objects/pack')
    ffd, bloom_filename = tempfile.mkstemp('.bloom', 'tmp-gc-', pack_dir)
    os.close(ffd)
    # FIXME: allow selection of k?
    # FIXME: support ephemeral bloom filters (i.e. *never* written to disk)
    live_objs = bloom.create(bloom_filename, expected=existing_count, k=None)
    stop_at, trees_visited = None, None
    if prune_visited_trees:
        trees_visited = set()
        # walk_object passes hex ids; the set holds binary ids.
        stop_at = lambda x: unhexlify(x) in trees_visited
    approx_live_count = 0
    for ref_name, ref_id in git.list_refs():
        for item in walk_object(cat_pipe, hexlify(ref_id),
                                stop_at=stop_at,
                                include_data=None):
            # FIXME: batch ids
            if opt.verbose:
                report_live_item(approx_live_count, existing_count,
                                 ref_name, ref_id, item)
            bin_id = unhexlify(item.id)
            # NOTE(review): if walk_object yields bytes types on Python 3,
            # this comparison needs b'tree' -- confirm against walk_object.
            if trees_visited is not None and item.type == 'tree':
                trees_visited.add(bin_id)
            if opt.verbose:
                # Only count ids the filter has not (apparently) seen yet,
                # so the progress figure approximates distinct objects.
                if not live_objs.exists(bin_id):
                    live_objs.add(bin_id)
                    approx_live_count += 1
            else:
                live_objs.add(bin_id)
    # Presumably dropped so the set can be reclaimed before returning.
    trees_visited = None
    if opt.verbose:
        log('expecting to retain about %.2f%% unnecessary objects\n'
            % live_objs.pfalse_positive())
    return live_objs
def find_live_objects(repo, existing_count, cat_pipe, verbosity=0):
    """Walk every ref in repo and return a bloom filter of the oids reached.

    existing_count is passed to bloom.create() as the expected number of
    entries and to report_live_item() for progress output.  cat_pipe.get
    is handed to walk_object() to fetch objects.  A non-zero verbosity
    enables per-item progress reporting and logs the filter's
    false-positive estimate at the end.

    The filter is backed by a temporary "tmp-gc-*.bloom" file in
    repo.packdir() that is unlinked as soon as the filter has been
    created.  If creating the filter fails the temporary file is removed,
    and if the walk fails the filter is closed, before the exception
    propagates.
    """
    prune_visited_trees = True  # In case we want a command line option later
    pack_dir = repo.packdir()
    ffd, bloom_filename = tempfile.mkstemp(b'.bloom', b'tmp-gc-', pack_dir)
    os.close(ffd)
    # FIXME: allow selection of k?
    # FIXME: support ephemeral bloom filters (i.e. *never* written to disk)
    try:
        live_objs = bloom.create(bloom_filename, expected=existing_count,
                                 k=None)
    except BaseException:
        # Don't leave the temporary file behind in the pack directory.
        try:
            os.unlink(bloom_filename)
        except OSError:
            pass  # best effort; the creation error is the one to report
        raise
    # live_objs will hold on to the fd until close or exit
    os.unlink(bloom_filename)
    stop_at, trees_visited = None, None
    if prune_visited_trees:
        trees_visited = set()
        # walk_object passes hex ids; the set holds binary oids.
        stop_at = lambda x: unhexlify(x) in trees_visited
    approx_live_count = 0
    try:
        for ref_name, ref_id in repo.refs():
            for item in walk_object(cat_pipe.get, hexlify(ref_id),
                                    stop_at=stop_at,
                                    include_data=None):
                # FIXME: batch ids
                if verbosity:
                    report_live_item(approx_live_count, existing_count,
                                     ref_name, ref_id, item, verbosity)
                if trees_visited is not None and item.type == b'tree':
                    trees_visited.add(item.oid)
                if verbosity:
                    # Only count oids the filter has not (apparently) seen
                    # yet, so the figure approximates distinct objects.
                    if not live_objs.exists(item.oid):
                        live_objs.add(item.oid)
                        approx_live_count += 1
                else:
                    live_objs.add(item.oid)
    except BaseException:
        # The caller never receives live_objs on failure, so release it
        # here rather than leaking it until exit.
        live_objs.close()
        raise
    # Presumably dropped so the set can be reclaimed before returning.
    trees_visited = None
    if verbosity:
        log('expecting to retain about %.2f%% unnecessary objects\n'
            % live_objs.pfalse_positive())
    return live_objs
def find_live_objects(existing_count, cat_pipe, opt):
    """Walk every ref and return a bloom filter of the object ids reached.

    existing_count is passed to bloom.create() as the expected number of
    entries and to report_live_item() for progress output.  cat_pipe is
    handed to walk_object() to fetch objects.  When opt.verbose is set,
    progress is reported per item and the filter's false-positive
    estimate is logged at the end.

    The filter is backed by a temporary "tmp-gc-*.bloom" file created in
    the repository's pack directory; this function does not remove it.
    """
    # Local import: hexlify/unhexlify give the same results as the
    # Python 2 only str.encode('hex')/str.decode('hex'), but also exist
    # on Python 3.
    from binascii import hexlify, unhexlify

    prune_visited_trees = True  # In case we want a command line option later
    pack_dir = git.repo('objects/pack')
    ffd, bloom_filename = tempfile.mkstemp('.bloom', 'tmp-gc-', pack_dir)
    os.close(ffd)
    # FIXME: allow selection of k?
    # FIXME: support ephemeral bloom filters (i.e. *never* written to disk)
    live_objs = bloom.create(bloom_filename, expected=existing_count, k=None)
    stop_at, trees_visited = None, None
    if prune_visited_trees:
        trees_visited = set()
        # walk_object passes hex ids; the set holds binary ids.
        stop_at = lambda x: unhexlify(x) in trees_visited
    approx_live_count = 0
    for ref_name, ref_id in git.list_refs():
        for item in walk_object(cat_pipe, hexlify(ref_id),
                                stop_at=stop_at,
                                include_data=None):
            # FIXME: batch ids
            if opt.verbose:
                report_live_item(approx_live_count, existing_count,
                                 ref_name, ref_id, item)
            bin_id = unhexlify(item.id)
            # NOTE(review): if walk_object yields bytes types on Python 3,
            # this comparison needs b'tree' -- confirm against walk_object.
            if trees_visited is not None and item.type == 'tree':
                trees_visited.add(bin_id)
            if opt.verbose:
                # Only count ids the filter has not (apparently) seen yet,
                # so the progress figure approximates distinct objects.
                if not live_objs.exists(bin_id):
                    live_objs.add(bin_id)
                    approx_live_count += 1
            else:
                live_objs.add(bin_id)
    # Presumably dropped so the set can be reclaimed before returning.
    trees_visited = None
    if opt.verbose:
        log('expecting to retain about %.2f%% unnecessary objects\n'
            % live_objs.pfalse_positive())
    return live_objs
def get_random_item(name, hash, repo, writer, opt):
    """Write the objects walked from hash that writer does not yet have.

    Objects are fetched through repo.cat by walk_object(), starting at
    hash, and each yielded object is passed to writer.just_write().
    The name and opt arguments are not used here.
    """
    # The stop_at predicate ensures that writer.exists(id) is false for
    # everything yielded.  Otherwise, just_write() would fail.
    unseen_objects = walk_object(
        repo.cat,
        hash,
        stop_at=lambda hex_oid: writer.exists(unhexlify(hex_oid)),
        include_data=True)
    for obj in unseen_objects:
        writer.just_write(obj.oid, obj.type, obj.data)
def get_random_item(name, hash, repo, dest_repo, opt):
    """Write the objects walked from hash that dest_repo does not yet have.

    Objects are fetched through repo.cat by walk_object(), starting at
    hash.  Each yielded object is passed to dest_repo.just_write() with
    metadata=True for trees, commits, symlink blobs and blobs whose last
    path component is b'.bupm', and metadata=False otherwise.
    The name and opt arguments are not used here.
    """
    def in_dest(hex_oid):
        return dest_repo.exists(unhexlify(hex_oid))

    # in_dest ensures that dest_repo.exists(id) is false for everything
    # yielded.  Otherwise, just_write() would fail.
    missing_objects = walk_object(repo.cat, hash, stop_at=in_dest,
                                  include_data=True)
    for obj in missing_objects:
        if obj.type in (b'tree', b'commit'):
            is_metadata = True
        elif obj.type != b'blob':
            is_metadata = False
        else:
            is_symlink = obj.mode is not None and stat.S_ISLNK(obj.mode)
            # The path is only consulted when the blob is not a symlink.
            is_metadata = is_symlink or (bool(obj.path)
                                         and obj.path[-1] == b'.bupm')
        dest_repo.just_write(obj.oid, obj.type, obj.data,
                             metadata=is_metadata)