def in_shard(self, rpm):
    '''
    Return whether `rpm` belongs to this shard.

    A 64-bit integer is derived from `rpm.filename()`, and the RPM is in
    the shard when that integer modulo `self.modulo` equals `self.shard`.
    '''
    # Our contract is that the RPM filename is the global primary key, so
    # the filename is the only thing that gets hashed.
    #
    # The hash is the last 8 bytes of SHA1: parallel downloads need a
    # deterministic hash, and the Python standard library lacks fast
    # non-cryptographic hashes like CityHash or SpookyHashV2.  adler32 is
    # faster, but way too collision-prone to bother.
    filename_digest = hashlib.sha1(byteme(rpm.filename())).digest()
    # A SHA1 digest is 20 bytes long, so offset 12 starts its last 8 bytes.
    (shard_key,) = _UINT64_STRUCT.unpack_from(filename_digest, 12)
    return shard_key % self.modulo == self.shard
def __init__(self, path: AnyStr, already_exists=False):
    '''
    A `Subvol` may stand for a subvolume that has not been created yet.
    Unless you pass already_exists=True, you must call create() or
    snapshot() to actually make the subvolume.

    With already_exists=True, an `AssertionError` is raised when
    `_path_is_btrfs_subvol` reports that the path is not a btrfs subvolume.
    '''
    # The path is stored as absolute bytes.
    self._path = os.path.abspath(byteme(path))
    self._exists = already_exists
    if not self._exists:
        # Nothing to verify for a subvolume that is yet to be created.
        return
    if not _path_is_btrfs_subvol(self._path):
        raise AssertionError(f'No btrfs subvol at {self._path}')
def __init__(
    self,
    unshare: Optional[Unshare],
    image_path: bytes,
    size_bytes: int,
):
    '''
    Record the settings for the volume; nothing is formatted or mounted
    here.  A `tempfile.TemporaryDirectory` object is created, but its
    context is not entered by the constructor.
    '''
    self._unshare = unshare
    self._temp_dir_ctx = tempfile.TemporaryDirectory()  # noqa: P201
    self._size_bytes = size_bytes
    # Keep the image location as an absolute path, in bytes.
    absolute_image_path = os.path.abspath(image_path)
    self._image_path = byteme(absolute_image_path)
    # Neither directory is known at construction time.
    self._temp_dir: Optional[bytes] = None
    self._mount_dir: Optional[bytes] = None
def path(
    self,
    path_in_subvol: AnyStr = b'.',
    *,
    no_dereference_leaf=False,
) -> bytes:
    '''
    The only safe way to access paths inside the subvolume.  Do NOT write
    `os.path.join(subvol.path('a/path'), 'more/path')`, because that skips
    crucial safety checks.  Write `subvol.path(os.path.join(...))` instead.

    The checks here mitigate two risks:
      - `path_in_subvol` is relative, and leaves the subvolume via '..'
      - Some component of the path is a symlink which, when interpreted
        by a non-chrooted tool, would reach outside of the subvolume.

    At present, the check fails when traversing an in-subvolume symlink
    that is an absolute path to another directory within the subvolume.
    Support could easily be added, but it is deliberately absent: the
    preferred idiom is for image authors to manipulate the "real"
    locations of files rather than paths that go through symlinks.

    In the rare case that you need to manipulate a symlink itself (e.g.
    remove or rename it), pass `no_dereference_leaf` so that the last
    path component is not resolved.

    Returns the normalized path under the subvolume root, as bytes.
    Raises `AssertionError` if the path resolves outside of the subvolume.

    Future: consider using a file descriptor to refer to the subvolume
    root directory to better mitigate races due to renames in its path.
    '''
    # Without the lstrip, an absolute `path_in_subvol` would make
    # `os.path.join` drop the subvolume prefix.
    inner_path = byteme(path_in_subvol).lstrip(b'/')
    # The `btrfs` CLI is not very flexible: unless `/subvol/.` is
    # normalized, it will try to name a subvol '.'.
    result_path = os.path.normpath(os.path.join(self._path, inner_path))

    # Paranoia: make sure that, despite any symlinks in the path, the
    # resulting path does not land outside of the subvolume root.
    if no_dereference_leaf:
        # Resolve only the parent directory, so that a symlink in the
        # LAST component can itself be addressed.
        resolved_path = os.path.join(
            os.path.realpath(os.path.dirname(result_path)),
            os.path.basename(result_path),
        )
    else:
        # NB: Every component is resolved here, so a symlink created
        # inside the subvolume is checked by its target.
        resolved_path = os.path.realpath(result_path)

    root_relative = os.path.relpath(
        resolved_path, os.path.realpath(self._path),
    )
    if root_relative == b'..' or root_relative.startswith(b'../'):
        raise AssertionError(f'{path_in_subvol} is outside the subvol')
    return result_path
def __enter__(self) -> 'LoopbackVolume':
    '''
    Enter the temporary-directory context, then call `_format_image_file`
    on the image, create a `volume` directory inside the temporary
    directory, and call `_mount_image_file` with it as the mount directory.

    If any step after entering the temporary directory fails, `__exit__`
    is invoked with the active exception info before the exception is
    re-raised.
    '''
    entered_dir = self._temp_dir_ctx.__enter__()
    self._temp_dir = byteme(os.path.abspath(entered_dir))
    try:
        # The helper's return value replaces the requested size.
        self._size_bytes = _format_image_file(
            self._image_path, self._size_bytes,
        )
        # Record the mount directory before creating it, so that it is
        # already set if `os.mkdir` fails and `__exit__` runs.
        self._mount_dir = os.path.join(self._temp_dir, b'volume')
        os.mkdir(self._mount_dir)
        self._loop_dev = _mount_image_file(
            self._unshare, self._image_path, self._mount_dir,
        )
    except BaseException:
        # `BaseException` also covers e.g. `KeyboardInterrupt`.
        self.__exit__(*sys.exc_info())
        raise
    else:
        return self
def store_repomd(
    self, universe: str, repo: str, repomd: RepoMetadata,
) -> int:
    '''
    Returns the inserted `fetch_timestamp`, ours or from a racing writer.

    Tries to insert a `repo_metadata` row for `(universe, repo)` built
    from `repomd`.  If the insert affected a row, our own
    `repomd.fetch_timestamp` is returned.  Otherwise a prior row with the
    same universe, repo and checksum is read back, checked for agreement
    with our data, and its `fetch_timestamp` is returned.

    Raises `AssertionError` if the stored row's `fetch_timestamp` is more
    than 60 seconds ahead of ours, or if its `xml` differs from ours.
    These checks are explicit `raise` statements rather than `assert`,
    so that they still run under `python -O`.
    '''
    validate_universe_name(universe)
    with self._cursor() as cursor:
        fts = repomd.fetch_timestamp
        bts = repomd.build_timestamp
        checksum = str(repomd.checksum)
        repomd_xml = byteme(repomd.xml)
        # Future: We could start with a sanity check like below.  I'm
        # not sure of its value, though, and it would slow us down.
        #
        # for repodata in repomd.repodatas:
        #     assert repodata.checksum() in DB
        p = self._placeholder()
        # Only the placeholder token and the "or ignore" keyword are
        # interpolated into the SQL; all values are bound as parameters.
        cursor.execute(
            f'''
            INSERT {self._or_ignore()} INTO `repo_metadata` (
                `universe`, `repo`, `fetch_timestamp`, `build_timestamp`,
                `checksum`, `xml`
            ) VALUES ({p}, {p}, {p}, {p}, {p}, {p});
            ''',
            (universe, repo, fts, bts, checksum, repomd_xml),
        )
        if cursor.rowcount:
            return fts  # Our timestamp was the one that got inserted.

        # We lost the race, so ensure the prior data agrees with ours.
        # We don't need to check `build_timestamp`, it comes from `xml`.
        cursor.execute(
            f'''
            SELECT `fetch_timestamp`, `xml` FROM `repo_metadata`
            WHERE (
                `universe` = {p} AND `repo` = {p} AND `checksum` = {p}
            );
            ''',
            (universe, repo, checksum),
        )
        # Exactly one matching row is expected; the unpacking fails with
        # `ValueError` otherwise.
        (db_fts, db_repomd_xml), = cursor.fetchall()
        # Allow a generous 1 minute of clock skew
        if fts + 60 < db_fts:
            raise AssertionError(f'{fts} + 60 < {db_fts}')
        if repomd_xml != db_repomd_xml:
            raise AssertionError(f'{repomd_xml} {db_repomd_xml}')
        return db_fts