Example No. 1
def payloadinfo(rpmfn):
    inode = defaultdict(list)
    r = rpm(rpmfn)
    for f in r.payloadinfo():
        # make sure we put file entries with digests at the beginning
        if f.digest:
            inode[f.stat.ino].insert(0, f)
        else:
            inode[f.stat.ino].append(f)
    return r.envra, [inode[i] for i in sorted(inode)]
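
A minimal usage sketch for payloadinfo() above; the package filename is hypothetical, and the entries are only assumed to have the .digest and .stat attributes used in the snippet.
envra, inode_groups = payloadinfo("example-1.0-1.x86_64.rpm")  # hypothetical path
for entries in inode_groups:
    primary, *links = entries  # digest-bearing entry first (see insert(0, f) above)
    print(envra, primary.stat.ino, primary.digest, f"{len(links)} hardlink(s)")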
Example No. 2
def rpmlister(dirs):
    '''list RPMs under the given dirs, grouped by .src.rpm name, sorted newest-oldest'''
    print("Finding RPMs, one moment..")
    rpmps = [p for d in dirs for p in Path(d).glob('**/*.rpm')]
    # read RPM headers and get source RPM tuple for each package
    srctup = dict()
    for p in progress(rpmps, prefix='Reading RPM headers ', itemfmt=lambda p: p.name):
        # TODO: we should also gather header/payload sizes and warn if we're
        # probably going to blow up 32-bit offsets. (Or, like.. auto-split
        # files at that point...)
        srctup[p] = rpm(p).srctup()
    src = rpm_src_groupsort(srctup.items())
    return {name:[p for pkgs in src[name].values() for p in pkgs] for name in src}
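
A usage sketch for rpmlister(), assuming hypothetical repo directories and that the returned lists preserve the Path objects gathered above; the result maps each source-RPM name to its binary package paths, newest first.
by_src = rpmlister(["/srv/repo/fedora", "/srv/repo/updates"])  # hypothetical dirs
for srcname, paths in by_src.items():
    print(f"{srcname}: {len(paths)} RPMs, newest is {paths[0].name}")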
Example No. 3
def get_rpmhdr(self, key):
    tup = self.rpmidx.get(key)
    if not tup:
        return None
    off, size = tup[0:2]  # we don't use unc_size here
    self.rpmhdr.fobj.seek(off)
    hdr = self._unz.decompress(self.rpmhdr.fobj.read(size))
    r = rpm(hdrbytes=hdr)
    if len(hdr) > r.headersize:
        # any extra data after the header is the stored payload ordering
        r.payload_order = struct.iter_unpack("I", hdr[r.headersize:])
    else:
        r.payload_order = None
    return r
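
A small usage sketch: struct.iter_unpack() yields 1-tuples, so a caller that wants plain indices flattens them. The archive object `d` and the lookup key are hypothetical.
r = d.get_rpmhdr(pkgkey)  # hypothetical key taken from the rpm index
if r is not None and r.payload_order is not None:
    order = [i for (i,) in r.payload_order]  # flatten struct's 1-tuples
    print("payload stored out of header order:", order[:10])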
Example No. 4
def gather_blob_sizes(repodir, skip_envras=None):
    if not skip_envras:
        skip_envras = set()

    blobsizes = dict()  # {digest:size}
    envrablobs = dict()  # {envra:[digest,...]}
    # TODO: fileclasses & compressed sizes?

    rpmiter = (rpm(rpmfn) for rpmfn in iter_repo_rpms(repodir))

    for r in progress(rpmiter, itemfmt=lambda r: r.nevra):
        if r.envra in skip_envras:
            continue
        blobs = list()
        # in theory we should always have one digest for each regular
        # (non-ghost) file, so they'd pair back up with a sorted list of
        # filenames from another source (say filelists.xml.gz)
        for name, size, blob in blob_sizes(r):
            if blob:
                blobs.append(blob)
                blobsizes[blob] = size
        envrablobs[r.envra] = blobs
    return blobsizes, envrablobs
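
A sketch of one way to use the two returned dicts, estimating how much smaller content-addressed storage would be than keeping every per-package copy; the repo path is hypothetical.
blobsizes, envrablobs = gather_blob_sizes("/srv/repo")  # hypothetical path
unique = sum(blobsizes.values())
naive = sum(blobsizes[b] for blobs in envrablobs.values() for b in blobs)
print(f"unique file data: {unique} bytes ({1 - unique/naive:.1%} less than storing every copy)")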
Example No. 5
def dump_deps(repo_paths, outfile="depdata.json.gz"):
    deps = dict()
    rpmcount = 0
    depcount = 0
    for rpmfn in progress(iter_repo_rpms(repo_paths), itemfmt=basename):
        r = rpm(rpmfn)
        # Skip duplicate ENVRAs
        if r.envra in deps:
            continue
        deps[r.envra] = r.alldeps()
        rpmcount += 1
        depcount += len(deps[r.envra])
    print("dumping {}...".format(outfile))
    with gzip.open(outfile, 'wt') as outf:
        o = OrderedDict()
        o['type'] = 'deps'
        o['version'] = 1
        o['counts'] = {'rpms': rpmcount, 'deps': depcount}
        o['deps'] = [{'envra': t, 'deps': d} for t, d in deps.items()]
        json.dump(o, outf)
    return deps
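
A sketch of reading the dump back, matching the structure written above.
import gzip, json
with gzip.open("depdata.json.gz", "rt") as f:
    data = json.load(f)
assert data["type"] == "deps" and data["version"] == 1
print(data["counts"])  # {'rpms': ..., 'deps': ...}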
Example No. 6
def merge_rpms(rpmiter, outfile, **dino_kwargs):
    # Start with a new header object
    d = DINORPMArchive(**dino_kwargs)
    count, rpmsize, rpmtotal = 0, 0, 0

    # separate contexts for compressing headers vs. files.
    # TODO: it might be helpful if we made dictionaries for each?
    fzst = d.dino.get_compressor(level=d.compresslevel)
    hzst = d.dino.get_compressor(level=d.compresslevel)

    # Okay let's start adding some RPMs!
    if not verbose:
        rpmiter = progress(rpmiter, prefix=Path(outfile).name+': ', itemfmt=lambda p: p.name)
    for rpmfn in rpmiter:
        vprint(f'{rpmfn}:')
        r = rpm(rpmfn)

        # update stats
        count += 1
        rpmsize = r.payloadsize + r.headersize
        rpmtotal += rpmsize

        # We handle the files before the RPM header because while _nearly_
        # everything in the RPM payload can be reconstructed from the RPM
        # header itself, there are a couple tiny things that could be
        # different, like the ordering of the files in the archive.
        # NOTE: I'm almost sure we can reproduce the original _uncompressed_
        # payload, but I'm really not certain that we can recreate the exact
        # compression context (or timestamps or whatever else) that's needed.

        # Grab the filenames and digests from the rpmhdr
        fnames = ["."+f for f in r.iterfiles()]
        rpmalgo = r.getval(Tag.FILEDIGESTALGO)
        digests = r.getval(Tag.FILEDIGESTS)

        # show RPM header/file sizes
        vprint(f'  RPM: hdr={r.headersize-0x60:<6} files={len(fnames):<3} filesize={r.payloadsize}'
               f' compr={r.payloadsize/unc_payloadsize(r):<6.2%}')


        # Keep track of the order of the files in the payload
        payload_in_order = True
        payload_order = []
        hdridx = {f:n for n,f in enumerate(fnames)}

        # Start running through the RPM payload
        filecount, filesize, unc_filesize = 0, 0, 0
        for n,item in enumerate(r.payload_iter()):
            # Does the payload name match the corresponding header name?
            # If not, find the header index for the payload filename.
            if item.name == fnames[n]:
                idx = n
            else:
                payload_in_order = False
                idx = hdridx[item.name]
            payload_order.append(idx)

            # We only store regular files with actual data
            if not (item.isreg and item.size):
                continue

            # Set up hashers
            hashers = {algo:gethasher(algo) for algo in (rpmalgo, d.idxalgo)}

            # Uncompress file, hash it, and write it to a temporary file.
            # If the calculated file key isn't in the index, compress the
            # temporary file contents into the filedata section.
            with SpooledTemporaryFile() as tmpf:
                # Uncompress and hash the file contents
                for block in item.get_blocks():
                    # TODO: parallelize? parallelize!
                    tmpf.write(block)
                    for h in hashers.values():
                        h.update(block)
                # Check digest to make sure the file is OK
                h = hashers[rpmalgo]
                if h.hexdigest() != digests[idx]:
                    act = h.hexdigest()
                    exp = digests[idx]
                    raise VerifyError(f"{fnames[idx]}: expected {exp}, got {act}")
                # Add this if it's not already in the fileidx
                filekey = hashers[d.idxalgo].digest()
                if filekey not in d.fileidx:
                    # Write file data into its own compressed frame.
                    tmpf.seek(0)
                    offset = d.filedata.fobj.tell()
                    usize, size = fzst.copy_stream(tmpf, d.filedata.fobj, size=item.size)
                    vprint(f"wrote {size} bytes to filedata sec at offset {offset}")
                    d.fileidx.add(filekey, offset, size, usize)
                    assert d.filedata.fobj.tell() == offset + size
                    filecount += 1
                    filesize += size
                    unc_filesize += item.size

        # Okay, files are added, now we can add the rpm header.
        # FIXME: we shouldn't have to do this manually..
        hdr = None
        with open(r.name, 'rb') as fobj:
            fobj.seek(0x60) # don't bother with the lead
            hdr = fobj.read(r.headersize-0x60)

        # Check signature header digest (if present)
        sigkey = r.sig.getval(SigTag.SHA256, '')
        if sigkey:
            h = gethasher(HashAlgo.SHA256)
            h.update(hdr[-r.hdr.size:])
            if sigkey != h.hexdigest():
                raise VerifyError(f"SHA256 mismatch in {r.name}: expected {sigkey} got {h.hexdigest()}")

        # Add the payload ordering
        if not payload_in_order:
            hdr += b''.join(struct.pack('I',i) for i in payload_order)

        # Add it to the rpmhdr section
        offset = d.rpmhdr.fobj.tell()
        usize, size = hzst.copy_stream(BytesIO(hdr), d.rpmhdr.fobj, size=len(hdr))
        assert d.rpmhdr.fobj.tell() == offset + size
        sizediff = (size+filesize)-rpmsize
        vprint(f' DINO: hdr={size:<6} files={filecount:<3} filesize={filesize}'
               f' {f"compr={filesize/unc_filesize:<6.2%}" if filesize else ""}'
               f' diff={sizediff:+} ({sizediff/rpmsize:+.1%})'
               f' {"(!)" if sizediff/rpmsize > 0.02 else ""}')

        # Generate pkgkey (TODO: maybe copy_into should do this..)
        # TODO: y'know, it might be more useful to use the sha256 of the
        # package envra - which, in theory, should also be unique, but also
        # gives us fast package lookups by name...
        #pkgid = hashlib.sha256(bytes(r.envra, 'utf8')).hexdigest()
        hasher = gethasher(d.idxalgo)
        hasher.update(hdr)
        pkgkey = hasher.digest()
        # Add package key to the index
        d.rpmidx.add(pkgkey, offset, size, usize)

    # We did it! Write the data to the output file!
    with open(outfile, 'wb') as outf:
        wrote = d.dino.write_to(outf)
    sizediff = wrote-rpmtotal
    print(f'packed {count} packages ({rpmtotal} bytes) into {outfile} '
          f'({sizediff/rpmtotal:<+.1%} -> {wrote} bytes)'
          f'{" (!)" if wrote > rpmtotal else ""}')
    return rpmtotal, wrote
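
A minimal call sketch, assuming an iterable of Path objects is acceptable for rpmiter (each item is passed to rpm() and formatted via .name above) and that DINORPMArchive's default kwargs are fine; the repo path is hypothetical.
from pathlib import Path
rpms = sorted(Path("/srv/repo").glob("*.rpm"))  # hypothetical repo path
rpmtotal, wrote = merge_rpms(rpms, "repo.dino")
print(f"{rpmtotal} bytes of RPMs -> {wrote} bytes of DINO")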
Example No. 7
@classmethod
def setUpClass(cls):
    cls._rpm = rpm(RPMFILE['fuse-common'])