def _assert_index_file(self, index, version, size):
    assert index.packfile_checksum() != index.indexfile_checksum()
    assert len(index.packfile_checksum()) == 20
    assert len(index.indexfile_checksum()) == 20
    assert index.version() == version
    assert index.size() == size
    assert len(index.offsets()) == size

    # get all data of all objects
    for oidx in xrange(index.size()):
        sha = index.sha(oidx)
        assert oidx == index.sha_to_index(sha)

        entry = index.entry(oidx)
        assert len(entry) == 3

        assert entry[0] == index.offset(oidx)
        assert entry[1] == sha
        assert entry[2] == index.crc(oidx)

        # verify partial sha
        for l in (4, 8, 11, 17, 20):
            assert index.partial_sha_to_index(sha[:l], l * 2) == oidx
    # END for each object index in indexfile
    self.failUnlessRaises(ValueError, index.partial_sha_to_index, "\0", 2)
def _assert_object_writing_simple(self, db):
    # write a bunch of objects and query their streams and info
    null_objs = db.size()
    ni = 250
    for i in xrange(ni):
        data = pack(">L", i)
        istream = IStream(str_blob_type, len(data), BytesIO(data))
        new_istream = db.store(istream)
        assert new_istream is istream
        assert db.has_object(istream.binsha)

        info = db.info(istream.binsha)
        assert isinstance(info, OInfo)
        assert info.type == istream.type and info.size == istream.size

        stream = db.stream(istream.binsha)
        assert isinstance(stream, OStream)
        assert stream.binsha == info.binsha and stream.type == info.type
        assert stream.read() == data
    # END for each item

    assert db.size() == null_objs + ni
    shas = list(db.sha_iter())
    assert len(shas) == db.size()
    assert len(shas[0]) == 20
def test_correctness(self):
    pdb = PackedDB(os.path.join(self.gitrepopath, "objects/pack"))
    # disabled for now as it used to work perfectly; checking big repositories takes a long time
    print("Endurance run: verify streaming of objects (crc and sha)", file=sys.stderr)
    for crc in range(2):
        count = 0
        st = time()
        for entity in pdb.entities():
            pack_verify = entity.is_valid_stream
            sha_by_index = entity.index().sha
            for index in xrange(entity.index().size()):
                try:
                    assert pack_verify(sha_by_index(index), use_crc=crc)
                    count += 1
                except UnsupportedOperation:
                    pass
                # END ignore old indices
            # END for each index
        # END for each entity
        elapsed = time() - st
        print("PDB: verified %i objects (crc=%i) in %f s ( %f objects/s )"
              % (count, crc, elapsed, count / elapsed), file=sys.stderr)
def _read_fanout(self, byte_offset):
    """Generate a fanout table from our data"""
    d = self._cursor.map()
    out = list()
    append = out.append
    for i in xrange(256):
        append(unpack_from('>L', d, byte_offset + i * 4)[0])
    # END for each entry
    return out
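# Illustration only: a minimal sketch of how the 256-entry cumulative fanout table
# returned by _read_fanout() narrows the binary search for a binary sha. `sha_at` is a
# hypothetical accessor returning the 20 byte sha stored at a given object index, and
# `binsha` is assumed to be a Python 3 bytes object; none of this is PackIndexFile API.
def _sketch_sha_to_index(fanout, sha_at, binsha):
    first = binsha[0]
    lo = fanout[first - 1] if first else 0      # first candidate object index
    hi = fanout[first]                          # one past the last candidate
    while lo < hi:
        mid = (lo + hi) // 2
        candidate = sha_at(mid)
        if candidate == binsha:
            return mid
        if candidate < binsha:
            lo = mid + 1
        else:
            hi = mid
    # END binary search within the fanout bucket
    return None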
def write(self, pack_sha, write):
    """Write the index file using the given write method
    :param pack_sha: binary sha over the whole pack that we index
    :return: sha1 binary sha over all index file contents"""
    # sort for sha1 hash
    self._objs.sort(key=lambda o: o[0])

    sha_writer = FlexibleSha1Writer(write)
    sha_write = sha_writer.write
    sha_write(PackIndexFile.index_v2_signature)
    sha_write(pack(">L", PackIndexFile.index_version_default))

    # fanout
    tmplist = list((0,) * 256)              # fanout or list with 64 bit offsets
    for t in self._objs:
        tmplist[byte_ord(t[0][0])] += 1
    # END prepare fanout
    for i in xrange(255):
        v = tmplist[i]
        sha_write(pack('>L', v))
        tmplist[i + 1] += v
    # END write each fanout entry
    sha_write(pack('>L', tmplist[255]))

    # sha1 ordered
    # save calls, that is push them into c
    sha_write(b''.join(t[0] for t in self._objs))

    # crc32
    for t in self._objs:
        sha_write(pack('>L', t[1] & 0xffffffff))
    # END for each crc

    tmplist = list()
    # offset 32
    for t in self._objs:
        ofs = t[2]
        if ofs > 0x7fffffff:
            tmplist.append(ofs)
            ofs = 0x80000000 + len(tmplist) - 1
        # END handle 64 bit offsets
        sha_write(pack('>L', ofs & 0xffffffff))
    # END for each offset

    # offset 64
    for ofs in tmplist:
        sha_write(pack(">Q", ofs))
    # END for each offset

    # trailer
    assert len(pack_sha) == 20
    sha_write(pack_sha)
    sha = sha_writer.sha(as_hex=False)
    write(sha)
    return sha
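# Illustration only: a hedged check of the trailer written by write() above. A v2 index
# ends with the pack checksum followed by a sha1 over all preceding index contents, so
# hashing everything but the last 20 bytes should reproduce the final 20 bytes. The
# function name and path argument are examples, not part of the library.
import hashlib

def _sketch_verify_index_trailer(index_path):
    with open(index_path, 'rb') as fp:
        data = fp.read()
    return hashlib.sha1(data[:-20]).digest() == data[-20:]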
def make_bytes(size_in_bytes, randomize=False):
    """:return: byte string with given size in bytes
    :param randomize: try to produce a very random stream"""
    actual_size = size_in_bytes // 4
    producer = xrange(actual_size)
    if randomize:
        producer = list(producer)
        random.shuffle(producer)
    # END randomize
    a = array('i', producer)
    return a.tobytes()
def offsets(self):
    """:return: sequence of all offsets in the order in which they were written

    **Note:** return value can be randomly accessed, but may be immutable"""
    if self._version == 2:
        # read stream to array, convert to tuple
        a = array.array('I')    # 4 byte unsigned int, long are 8 byte on 64 bit it appears
        a.fromstring(buffer(self._cursor.map(), self._pack_offset, self._pack_64_offset - self._pack_offset))

        # network byte order to something array likes more
        if sys.byteorder == 'little':
            a.byteswap()
        return a
    else:
        return tuple(self.offset(index) for index in xrange(self.size()))
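# Illustration only: the inverse of the "offset 32 / offset 64" encoding used in write()
# above. A 32 bit entry with the most significant bit set indexes the trailing table of
# 64 bit offsets; otherwise it is the pack offset itself. `ofs32` and `offsets64` are
# hypothetical inputs for this sketch.
def _sketch_decode_offset(ofs32, offsets64):
    if ofs32 & 0x80000000:
        return offsets64[ofs32 & 0x7fffffff]    # large pack: look up the real 64 bit offset
    return ofs32                                # small pack: offset is stored inline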
def compress(self):
    """Alter the list to reduce the amount of nodes. Currently we concatenate add-chunks
    :return: self"""
    slen = len(self)
    if slen < 2:
        return self
    i = 0

    first_data_index = None
    while i < slen:
        dc = self[i]
        i += 1
        if dc.data is None:
            if first_data_index is not None and i - 2 - first_data_index > 1:
                # if first_data_index is not None:
                nd = StringIO()                     # new data
                so = self[first_data_index].to      # start offset in target buffer
                for x in xrange(first_data_index, i - 1):
                    xdc = self[x]
                    nd.write(xdc.data[:xdc.ts])
                # END collect data

                del(self[first_data_index:i - 1])
                buf = nd.getvalue()
                self.insert(first_data_index, DeltaChunk(so, len(buf), 0, buf))

                slen = len(self)
                i = first_data_index + 1
            # END concatenate data
            first_data_index = None
            continue
        # END skip non-data chunks

        if first_data_index is None:
            first_data_index = i - 1
    # END iterate list

    # if slen_orig != len(self):
    #     print "INFO: Reduced delta list len to %f %% of former size" % ((float(len(self)) / slen_orig) * 100)
    return self
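# Illustration only: the same add-chunk concatenation idea as compress(), shown on plain
# tuples. Chunks are modelled as (target_offset, data) pairs where data is None for
# copy-chunks; consecutive data chunks are merged into one. These names are hypothetical
# and not part of the real DeltaChunk API.
def _sketch_merge_add_chunks(chunks):
    merged = []
    for to, data in chunks:
        if data is not None and merged and merged[-1][1] is not None:
            prev_to, prev_data = merged[-1]
            merged[-1] = (prev_to, prev_data + data)    # concatenate adjacent add-chunks
        else:
            merged.append((to, data))
    # END for each chunk
    return merged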
def sha_iter(self):
    for entity in self.entities():
        index = entity.index()
        sha_by_index = index.sha
        for index in xrange(index.size()):
            yield sha_by_index(index)
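# Illustration only: a hedged usage sketch of sha_iter() on a PackedDB, set up the same
# way as in test_correctness() above. The function name and pack directory argument are
# just examples.
from binascii import hexlify

def _sketch_list_pack_shas(pack_dir):
    pdb = PackedDB(pack_dir)
    return [hexlify(binsha) for binsha in pdb.sha_iter()]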
def _iter_objects(self, as_stream):
    """Iterate over all objects in our index and yield their OInfo or OStream instances"""
    _sha = self._index.sha
    _object = self._object
    for index in xrange(self._index.size()):
        yield _object(_sha(index), as_stream, index)