def payloadinfo(rpmfn):
    '''Return (envra, payload entries grouped by inode), with entries that carry a digest first in each group.'''
    inode = defaultdict(list)
    r = rpm(rpmfn)
    for f in r.payloadinfo():
        # make sure we put file entries with digests at the beginning
        if f.digest:
            inode[f.stat.ino].insert(0, f)
        else:
            inode[f.stat.ino].append(f)
    return r.envra, [inode[i] for i in sorted(inode)]
def rpmlister(dirs):
    '''List RPMs under the given directories, grouped by .src.rpm name, sorted newest-oldest'''
    print("Finding RPMs, one moment..")
    rpmps = [p for d in dirs for p in Path(d).glob('**/*.rpm')]
    # read RPM headers and get source RPM tuple for each package
    srctup = dict()
    for p in progress(rpmps, prefix='Reading RPM headers ', itemfmt=lambda p: p.name):
        # TODO: we should also gather header/payload sizes and warn if we're
        # probably going to blow up 32-bit offsets. (Or, like.. auto-split
        # files at that point...)
        srctup[p] = rpm(p).srctup()
    src = rpm_src_groupsort(srctup.items())
    return {name: [p for pkgs in src[name].values() for p in pkgs] for name in src}
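# Illustrative usage sketch (not part of the original source): one way the
# mapping returned by rpmlister() might be consumed. The directory below is a
# made-up placeholder; each value is a flat list of Path objects, newest
# builds first.
def _example_list_rpms():
    groups = rpmlister(["/tmp/repo"])  # hypothetical repo directory
    for srcname, pkgs in groups.items():
        print(f"{srcname}: {len(pkgs)} rpms, newest: {pkgs[0].name}")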
def get_rpmhdr(self, key):
    tup = self.rpmidx.get(key)
    if not tup:
        return None
    off, size = tup[0:2]  # we don't use unc_size here
    self.rpmhdr.fobj.seek(off)
    hdr = self._unz.decompress(self.rpmhdr.fobj.read(size))
    r = rpm(hdrbytes=hdr)
    if len(hdr) > r.headersize:
        r.payload_order = struct.iter_unpack("I", hdr[r.headersize:])
    else:
        r.payload_order = None
    return r
def gather_blob_sizes(repodir, skip_envras=None):
    if not skip_envras:
        skip_envras = set()
    blobsizes = dict()   # {digest:size}
    envrablobs = dict()  # {envra:[digest,...]}
    # TODO: fileclasses & compressed sizes?
    rpmiter = (rpm(rpmfn) for rpmfn in iter_repo_rpms(repodir))
    for r in progress(rpmiter, itemfmt=lambda r: r.nevra):
        if r.envra in skip_envras:
            continue
        blobs = list()
        # in theory we should always have one digest for each regular
        # (non-ghost) file, so they'd pair back up with a sorted list of
        # filenames from another source (say filelists.xml.gz)
        for name, size, blob in blob_sizes(r):
            if blob:
                blobs.append(blob)
                blobsizes[blob] = size
        envrablobs[r.envra] = blobs
    return blobsizes, envrablobs
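# Illustrative sketch (not part of the original source): using the two dicts
# returned by gather_blob_sizes() to estimate how much file data is duplicated
# across packages in a repo. repodir is whatever iter_repo_rpms() accepts.
def _example_dedup_estimate(repodir):
    blobsizes, envrablobs = gather_blob_sizes(repodir)
    total = sum(blobsizes[b] for blobs in envrablobs.values() for b in blobs)
    unique = sum(blobsizes.values())
    if total:
        print(f"unique file data: {unique} of {total} bytes ({unique/total:.1%})")
    return total, unique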
def dump_deps(repo_paths, outfile="depdata.json.gz"):
    deps = dict()
    rpmcount = 0
    depcount = 0
    for rpmfn in progress(iter_repo_rpms(repo_paths), itemfmt=basename):
        r = rpm(rpmfn)
        # Skip duplicate ENVRAs
        if r.envra in deps:
            continue
        deps[r.envra] = r.alldeps()
        rpmcount += 1
        depcount += len(deps[r.envra])
    print("dumping {}...".format(outfile))
    with gzip.open(outfile, 'wt') as outf:
        o = OrderedDict()
        o['type'] = 'deps'
        o['version'] = 1
        o['counts'] = {'rpms': rpmcount, 'deps': depcount}
        o['deps'] = [{'envra': t, 'deps': d} for t, d in deps.items()]
        json.dump(o, outf)
    return deps
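# Illustrative sketch (not part of the original source): reading back the file
# that dump_deps() writes, using only the structure constructed above
# ('type', 'version', 'counts', 'deps') and the same gzip/json imports the
# module already uses.
def _example_load_deps(depfile="depdata.json.gz"):
    with gzip.open(depfile, 'rt') as f:
        data = json.load(f)
    assert data['type'] == 'deps' and data['version'] == 1
    print("{rpms} rpms, {deps} deps".format(**data['counts']))
    return data['deps']  # list of {'envra': ..., 'deps': [...]} records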
def merge_rpms(rpmiter, outfile, **dino_kwargs):
    # Start with a new, empty archive object
    d = DINORPMArchive(**dino_kwargs)
    count, rpmsize, rpmtotal = 0, 0, 0
    # separate contexts for compressing headers vs. files.
    # TODO: it might be helpful if we made dictionaries for each?
    fzst = d.dino.get_compressor(level=d.compresslevel)
    hzst = d.dino.get_compressor(level=d.compresslevel)
    # Okay let's start adding some RPMs!
    if not verbose:
        rpmiter = progress(rpmiter, prefix=Path(outfile).name+': ',
                           itemfmt=lambda p: p.name)
    for rpmfn in rpmiter:
        vprint(f'{rpmfn}:')
        r = rpm(rpmfn)
        # update stats
        count += 1
        rpmsize = r.payloadsize + r.headersize
        rpmtotal += rpmsize
        # We handle the files before the RPM header because while _nearly_
        # everything in the RPM payload can be reconstructed from the RPM
        # header itself, there are a couple tiny things that could be
        # different, like the ordering of the files in the archive.
        # NOTE: I'm almost sure we can reproduce the original _uncompressed_
        # payload, but I'm really not certain that we can recreate the exact
        # compression context (or timestamps or whatever else) that would be
        # needed.
        # Grab the filenames and digests from the rpmhdr
        fnames = ["."+f for f in r.iterfiles()]
        rpmalgo = r.getval(Tag.FILEDIGESTALGO)
        digests = r.getval(Tag.FILEDIGESTS)
        # show RPM header/file sizes
        vprint(f' RPM: hdr={r.headersize-0x60:<6} files={len(fnames):<3} filesize={r.payloadsize}'
               f' compr={r.payloadsize/unc_payloadsize(r):<6.2%}')
        # Keep track of the order of the files in the payload
        payload_in_order = True
        payload_order = []
        hdridx = {f: n for n, f in enumerate(fnames)}
        # Start running through the RPM payload
        filecount, filesize, unc_filesize = 0, 0, 0
        for n, item in enumerate(r.payload_iter()):
            # Does the payload name match the corresponding header name?
            # If not, find the header index for the payload filename.
            if item.name == fnames[n]:
                idx = n
            else:
                payload_in_order = False
                idx = hdridx[item.name]
            payload_order.append(idx)
            # We only store regular files with actual data
            if not (item.isreg and item.size):
                continue
            # Set up hashers
            hashers = {algo: gethasher(algo) for algo in (rpmalgo, d.idxalgo)}
            # Uncompress file, hash it, and write it to a temporary file.
            # If the calculated file key isn't in the index, compress the
            # temporary file contents into the filedata section.
            with SpooledTemporaryFile() as tmpf:
                # Uncompress and hash the file contents
                for block in item.get_blocks():
                    # TODO: parallelize? parallelize!
                    tmpf.write(block)
                    for h in hashers.values():
                        h.update(block)
                # Check digest to make sure the file is OK
                h = hashers[rpmalgo]
                if h.hexdigest() != digests[idx]:
                    act = h.hexdigest()
                    exp = digests[idx]
                    raise VerifyError(f"{fnames[idx]}: expected {exp}, got {act}")
                # Add this if it's not already in the fileidx
                filekey = hashers[d.idxalgo].digest()
                if filekey not in d.fileidx:
                    # Write file data into its own compressed frame.
                    tmpf.seek(0)
                    offset = d.filedata.fobj.tell()
                    usize, size = fzst.copy_stream(tmpf, d.filedata.fobj, size=item.size)
                    vprint(f"wrote {size} bytes to filedata sec at offset {offset}")
                    d.fileidx.add(filekey, offset, size, usize)
                    assert d.filedata.fobj.tell() == offset + size
                    filecount += 1
                    filesize += size
                    unc_filesize += item.size
        # Okay, files are added, now we can add the rpm header.
        # FIXME: we shouldn't have to do this manually..
        hdr = None
        with open(r.name, 'rb') as fobj:
            fobj.seek(0x60)  # don't bother with the lead
            hdr = fobj.read(r.headersize-0x60)
        # Check signature header digest (if present)
        sigkey = r.sig.getval(SigTag.SHA256, '')
        if sigkey:
            h = gethasher(HashAlgo.SHA256)
            h.update(hdr[-r.hdr.size:])
            if sigkey != h.hexdigest():
                raise VerifyError(f"SHA256 mismatch in {r.name}: expected {sigkey} got {h.hexdigest()}")
        # Add the payload ordering
        if not payload_in_order:
            hdr += b''.join(struct.pack('I', i) for i in payload_order)
        # Add it to the rpmhdr section
        offset = d.rpmhdr.fobj.tell()
        usize, size = hzst.copy_stream(BytesIO(hdr), d.rpmhdr.fobj, size=len(hdr))
        assert d.rpmhdr.fobj.tell() == offset + size
        sizediff = (size+filesize)-rpmsize
        vprint(f' DINO: hdr={size:<6} files={filecount:<3} filesize={filesize}'
               f' {f"compr={filesize/unc_filesize:<6.2%}" if filesize else ""}'
               f' diff={sizediff:+} ({sizediff/rpmsize:+.1%})'
               f' {"(!)" if sizediff/rpmsize > 0.02 else ""}')
        # Generate pkgkey (TODO: maybe copy_into should do this..)
        # TODO: y'know, it might be more useful to use the sha256 of the
        # package envra - which, in theory, should also be unique, but also
        # gives us fast package lookups by name...
        #pkgid = hashlib.sha256(bytes(r.envra, 'utf8')).hexdigest()
        hasher = gethasher(d.idxalgo)
        hasher.update(hdr)
        pkgkey = hasher.digest()
        # Add package key to the index
        d.rpmidx.add(pkgkey, offset, size, usize)
    # We did it! Write the data to the output file!
    with open(outfile, 'wb') as outf:
        wrote = d.dino.write_to(outf)
    sizediff = wrote - rpmtotal
    print(f'packed {count} packages ({rpmtotal} bytes) into {outfile} '
          f'({sizediff/rpmtotal:<+.1%} -> {wrote} bytes)'
          f'{" (!)" if wrote > rpmtotal else ""}')
    return rpmtotal, wrote
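# Illustrative sketch (not part of the original source): wiring rpmlister()
# and merge_rpms() together so each source-package group gets merged into its
# own archive. The output directory and the '.dino' suffix are hypothetical.
def _example_merge_all(repodirs, outdir="merged"):
    Path(outdir).mkdir(parents=True, exist_ok=True)
    totals = {}
    for srcname, rpmpaths in rpmlister(repodirs).items():
        outfile = Path(outdir) / (srcname + ".dino")   # hypothetical suffix
        totals[srcname] = merge_rpms(rpmpaths, outfile)  # (rpmtotal, wrote)
    return totals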
def setUpClass(cls):
    cls._rpm = rpm(RPMFILE['fuse-common'])