def test_pack_writing(self):
    # see how fast we can write a pack from object streams.
    # This will not be fast, as we take time for decompressing the streams as well
    ostream = CountedNullStream()
    pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack"))

    ni = 5000
    count = 0
    total_size = 0
    st = time()
    objs = list()
    for sha in pdb.sha_iter():
        count += 1
        objs.append(pdb.stream(sha))
        if count == ni:
            break
    # END gather objects for pack-writing
    elapsed = time() - st
    print("PDB Streaming: Got %i streams by sha in %f s ( %f streams/s )"
          % (ni, elapsed, ni / elapsed), file=sys.stderr)

    st = time()
    PackEntity.write_pack(objs, ostream.write)
    elapsed = time() - st
    total_kb = ostream.bytes_written() / 1000
    print("PDB Streaming: Wrote pack of size %i kb in %f s (%f kb/s)"
          % (total_kb, elapsed, total_kb / elapsed), file=sys.stderr)
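# CountedNullStream is used above purely as a byte-counting sink for the pack data.
# A minimal sketch of such a sink, assuming only the write()/bytes_written()
# interface the benchmark relies on; the real class in the test library may differ.
class CountedNullStream(object):
    """Write sink that discards all data but remembers how many bytes were written."""

    def __init__(self):
        self._bw = 0

    def write(self, data):
        # drop the data, only account for its size
        self._bw += len(data)
        return len(data)

    def bytes_written(self):
        return self._bw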
def test_correctness(self):
    pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack"))
    # disabled for now as it used to work perfectly; checking big repositories takes a long time
    print("Endurance run: verify streaming of objects (crc and sha)", file=sys.stderr)
    for crc in range(2):
        count = 0
        st = time()
        for entity in pdb.entities():
            pack_verify = entity.is_valid_stream
            sha_by_index = entity.index().sha
            for index in range(entity.index().size()):
                try:
                    assert pack_verify(sha_by_index(index), use_crc=crc)
                    count += 1
                except UnsupportedOperation:
                    pass
                # END ignore old indices
            # END for each index
        # END for each entity
        elapsed = time() - st
        print("PDB: verified %i objects (crc=%i) in %f s ( %f objects/s )"
              % (count, crc, elapsed, count / elapsed), file=sys.stderr)
def test_pack_random_access(self):
    pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack"))

    # sha lookup
    st = time()
    sha_list = list(pdb.sha_iter())
    elapsed = time() - st
    ns = len(sha_list)
    print("PDB: looked up %i shas by index in %f s ( %f shas/s )"
          % (ns, elapsed, ns / elapsed), file=sys.stderr)

    # sha lookup: best-case and worst-case access
    pdb_pack_info = pdb._pack_info
    access_times = list()
    for rand in range(2):
        if rand:
            random.shuffle(sha_list)
        # END shuffle shas
        st = time()
        for sha in sha_list:
            pdb_pack_info(sha)
        # END for each sha to look up
        elapsed = time() - st
        access_times.append(elapsed)
        # discard cache
        del pdb._entities
        pdb.entities()
        print("PDB: looked up %i sha in %i packs (random=%i) in %f s ( %f shas/s )"
              % (ns, len(pdb.entities()), rand, elapsed, ns / elapsed), file=sys.stderr)
    # END for each random mode
    elapsed_order, elapsed_rand = access_times
    # well, it's never really sequential regarding the memory patterns, but it
    # shows how well the priority cache performs
    print("PDB: sequential access is %f %% faster than random-access"
          % (100 - ((elapsed_order / elapsed_rand) * 100)), file=sys.stderr)

    # query info and streams only
    max_items = 10000  # can wait longer when testing memory
    for pdb_fun in (pdb.info, pdb.stream):
        st = time()
        for sha in sha_list[:max_items]:
            pdb_fun(sha)
        elapsed = time() - st
        print("PDB: Obtained %i object %s by sha in %f s ( %f items/s )"
              % (max_items, pdb_fun.__name__.upper(), elapsed, max_items / elapsed), file=sys.stderr)
    # END for each function

    # retrieve stream and read all
    max_items = 5000
    pdb_stream = pdb.stream
    total_size = 0
    st = time()
    for sha in sha_list[:max_items]:
        stream = pdb_stream(sha)
        stream.read()
        total_size += stream.size
    elapsed = time() - st
    total_kib = total_size / 1000
    print("PDB: Obtained %i streams by sha and read all bytes totalling %i KiB ( %f KiB / s ) in %f s ( %f streams/s )"
          % (max_items, total_kib, total_kib / elapsed, elapsed, max_items / elapsed), file=sys.stderr)
def test_pack_random_access(self):
    pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack"))

    # sha lookup
    st = time()
    sha_list = list(pdb.sha_iter())
    elapsed = time() - st
    ns = len(sha_list)
    print("PDB: looked up %i shas by index in %f s ( %f shas/s )"
          % (ns, elapsed, ns / elapsed), file=sys.stderr)

    # sha lookup: best-case and worst-case access
    pdb_pack_info = pdb._pack_info
    # END shuffle shas
    st = time()
    for sha in sha_list:
        pdb_pack_info(sha)
    # END for each sha to look up
    elapsed = time() - st
    # discard cache
    del pdb._entities
    pdb.entities()
    print("PDB: looked up %i sha in %i packs in %f s ( %f shas/s )"
          % (ns, len(pdb.entities()), elapsed, ns / elapsed), file=sys.stderr)
    # END for each random mode

    # query info and streams only
    max_items = 10000  # can wait longer when testing memory
    for pdb_fun in (pdb.info, pdb.stream):
        st = time()
        for sha in sha_list[:max_items]:
            pdb_fun(sha)
        elapsed = time() - st
        print("PDB: Obtained %i object %s by sha in %f s ( %f items/s )"
              % (max_items, pdb_fun.__name__.upper(), elapsed, max_items / elapsed), file=sys.stderr)
    # END for each function

    # retrieve stream and read all
    max_items = 5000
    pdb_stream = pdb.stream
    total_size = 0
    st = time()
    for sha in sha_list[:max_items]:
        stream = pdb_stream(sha)
        read_len = len(stream.read())
        assert read_len == stream.size
        total_size += stream.size
    elapsed = time() - st
    total_kib = total_size / 1000
    print("PDB: Obtained %i streams by sha and read all bytes totalling %i KiB ( %f KiB / s ) in %f s ( %f streams/s )"
          % (max_items, total_kib, total_kib / elapsed, elapsed, max_items / elapsed), file=sys.stderr)
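# A condensed sketch of the per-object API exercised by the benchmark above:
# sha_iter() yields binary shas, info() returns metadata without decompressing,
# and stream() returns an object whose read() yields the full data. The helper
# name and the info.size attribute are assumptions beyond what the benchmark shows.
def show_one_object(pdb):
    sha = next(iter(pdb.sha_iter()))
    info = pdb.info(sha)              # metadata only
    data = pdb.stream(sha).read()     # full decompressed object data
    assert len(data) == info.size
    print("object of size %i read completely" % info.size)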
def test_pack_writing(self):
    # see how fast we can write a pack from object streams.
    # This will not be fast, as we take time for decompressing the streams as well
    ostream = CountedNullStream()
    pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack"))

    ni = 5000
    count = 0
    total_size = 0
    st = time()
    for sha in pdb.sha_iter():
        count += 1
        pdb.stream(sha)
        if count == ni:
            break
    # END gather objects for pack-writing
    elapsed = time() - st
    print("PDB Streaming: Got %i streams by sha in %f s ( %f streams/s )"
          % (ni, elapsed, ni / elapsed), file=sys.stderr)

    st = time()
    PackEntity.write_pack((pdb.stream(sha) for sha in pdb.sha_iter()), ostream.write, object_count=ni)
    elapsed = time() - st
    total_kb = ostream.bytes_written() / 1000
    print("PDB Streaming: Wrote pack of size %i kb in %f s (%f kb/s)"
          % (total_kb, elapsed, total_kb / elapsed), file=sys.stderr)
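# For comparison with the counting null sink above, a sketch of writing the same
# pack to an actual file. It reuses only the write_pack(streams, write_callable,
# object_count=...) call shown in the benchmark; the helper name and output path
# are purely illustrative.
def write_pack_to_file(pdb, pack_path, ni=5000):
    shas = []
    for sha in pdb.sha_iter():
        shas.append(sha)
        if len(shas) == ni:
            break
    # END gather shas
    with open(pack_path, "wb") as fp:
        PackEntity.write_pack((pdb.stream(sha) for sha in shas), fp.write, object_count=len(shas))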
def test_stream_reading(self):
    pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack"))

    # streaming only, meant for --with-profile runs
    ni = 5000
    count = 0
    pdb_stream = pdb.stream
    total_size = 0
    st = time()
    for sha in pdb.sha_iter():
        if count == ni:
            break
        stream = pdb_stream(sha)
        stream.read()
        total_size += stream.size
        count += 1
    elapsed = time() - st
    total_kib = total_size / 1000
    print("PDB Streaming: Got %i streams by sha and read all bytes totalling %i KiB ( %f KiB / s ) in %f s ( %f streams/s )"
          % (ni, total_kib, total_kib / elapsed, elapsed, ni / elapsed), file=sys.stderr)
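# The benchmarks above assume a fixture providing self.gitrepopath (the path to a
# repository's .git directory) plus imports along these lines. This is a hedged
# sketch: the base-class name, environment variable, and exact import paths are
# assumptions and may differ from the actual test library.
#
#   import os, sys, random
#   from time import time
#   from gitdb.db.pack import PackedDB
#   from gitdb.pack import PackEntity
#   from gitdb.exc import UnsupportedOperation
import os
import unittest


class PackBenchmarkBase(unittest.TestCase):
    """Hypothetical base class supplying the repository path used by the benchmarks."""

    def setUp(self):
        # point this at the .git directory of a reasonably large repository
        self.gitrepopath = os.environ.get("GITDB_TEST_GIT_REPO_BASE",
                                          os.path.expanduser("~/repos/git/.git"))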