def _download_rpm(
    rpm: Rpm, repo_url: str, rpm_table: RpmTable, cfg: DownloadConfig,
) -> Tuple[Rpm, str]:
    """Stream one RPM from `repo_url` into storage.

    Returns `(rpm_copy, storage_id)`, where `rpm_copy` is `rpm` with its
    `canonical_checksum` field populated.
    """
    log.info(f"Downloading {rpm}")
    storage = cfg.new_storage()
    with download_resource(repo_url, rpm.location) as input_, \
            storage.writer() as output:
        # Standardize on a single hash algorithm before committing to the
        # DB.  Otherwise, two repos could store the same RPM hashed with
        # different algorithms, tripping our "different hashes" detector
        # for a perfectly sane RPM.
        hasher = hashlib.new(CANONICAL_HASH)
        verified_chunks = verify_chunk_stream(
            read_chunks(input_, BUFFER_BYTES),
            [rpm.checksum],
            rpm.size,
            rpm.location,
        )
        for chunk in verified_chunks:  # May raise a ReportableError
            hasher.update(chunk)
            output.write(chunk)
        # NB: We can also query the RPM as we download it above, via
        # something like P123285392. However, at present, all necessary
        # metadata can be retrieved via `parse_metadata.py`.
        rpm = rpm._replace(canonical_checksum=Checksum(
            algorithm=CANONICAL_HASH, hexdigest=hasher.hexdigest(),
        ))
        storage_id = output.commit()
        assert storage_id is not None
        return rpm, storage_id
def _download_repodata(
    repodata: Repodata, *,
    repo_url: str,
    repodata_table: RepodataTable,
    cfg: DownloadConfig,
    is_primary: bool,
) -> DownloadRepodataReturnType:
    """Fetch (or re-read) one repodata blob, optionally parsing RPMs from it.

    This function behaves differently depending on two main characteristics:
      - Whether or not the provided repodata is primary, and
      - Whether or not it already exists in storage.
    Which actions are taken depends on which of the above is true, and this
    branching is explained within the function.

    Raises `RepodataParseError` if the primary repodata cannot be parsed,
    and may propagate a `ReportableError` from checksum/size verification.
    """
    storage = cfg.new_storage()
    # We only need to download the repodata if is not already in the DB,
    # or if it is primary (so we can parse it for RPMs).
    with cfg.new_db_ctx(readonly=True) as ro_repo_db:
        storage_id = ro_repo_db.get_storage_id(repodata_table, repodata)

    # Nothing to do -- only need to download repodata if it's the primary
    # (so we can parse it for RPMs), or if it's not already in the DB.
    if not is_primary and storage_id:
        return DownloadRepodataReturnType(repodata, False, storage_id, None)
    rpms = [] if is_primary else None

    # Remaining possibilities are that we've got a primary with or without
    # a storage_id, or a non-primary without a storage_id
    with ExitStack() as cm:
        rpm_parser = None
        if is_primary:
            # We'll parse the selected primary file to discover the RPMs.
            rpm_parser = cm.enter_context(get_rpm_parser(repodata))
        if storage_id:
            # Read the primary from storage as we already have an ID
            infile = cm.enter_context(storage.reader(storage_id))
            # No need to write as this repodata was already stored
            outfile = None
        else:
            # Nothing stored, must download - can fail due to repo updates
            infile = cm.enter_context(
                download_resource(repo_url, repodata.location))
            # Want to persist the downloaded repodata into storage so that
            # future runs don't need to redownload it
            outfile = cm.enter_context(storage.writer())
        log.info(f"Fetching {repodata}")
        for chunk in verify_chunk_stream(
            read_chunks(infile, BUFFER_BYTES),
            [repodata.checksum],
            repodata.size,
            repodata.location,
        ):  # May raise a ReportableError
            if outfile:
                outfile.write(chunk)
            if rpm_parser:
                try:
                    rpms.extend(rpm_parser.feed(chunk))
                except Exception as ex:
                    # FIX: chain explicitly so the original parser traceback
                    # is preserved as the cause, not just implicit context.
                    # The argument shape (location, ex) is kept for callers.
                    raise RepodataParseError(
                        (repodata.location, ex)
                    ) from ex
        # Must commit the output context to get a storage_id.
        if outfile:
            return DownloadRepodataReturnType(
                repodata, True, outfile.commit(), rpms)
    # The primary repodata was already stored, and we just parsed it for RPMs.
    assert storage_id is not None
    return DownloadRepodataReturnType(repodata, False, storage_id, rpms)
def get(args):
    """Copy the blob named by `args.storage_id` into `args.to_file`."""
    store = Storage.from_json(args.storage)
    with store.reader(args.storage_id) as blob:
        write = args.to_file.write
        for piece in read_chunks(blob, _CHUNK_SIZE):
            write(piece)
def put(args):
    """Store the contents of `args.from_file`; emit the new storage ID
    (newline-terminated, encoded) to `args.to_file`."""
    store = Storage.from_json(args.storage)
    with store.writer() as sink:
        for piece in read_chunks(args.from_file, _CHUNK_SIZE):
            sink.write(piece)
        sid = sink.commit()
        args.to_file.write((sid + '\n').encode())