def test_tar_experiment_download(self):
    self.assertTrue(all(df.verified for df in self.dfs))
    response = self.client.get(reverse(
        'tardis.tardis_portal.download.streaming_download_experiment',
        args=(self.exp.id, 'tar')))
    with NamedTemporaryFile('w') as tarfile:
        for c in response.streaming_content:
            tarfile.write(c)
        tarfile.flush()
        self.assertEqual(int(response['Content-Length']),
                         os.stat(tarfile.name).st_size)
        tf = TarFile(tarfile.name)
        if settings.EXP_SPACES_TO_UNDERSCORES:
            exp_title = self.exp.title.replace(' ', '_')
        else:
            exp_title = self.exp.title
        exp_title = quote(exp_title,
                          safe=settings.SAFE_FILESYSTEM_CHARACTERS)
        for df in self.dfs:
            full_path = os.path.join(
                exp_title,
                quote(self.ds.description,
                      safe=settings.SAFE_FILESYSTEM_CHARACTERS),
                df.directory, df.filename)
            # docker has a file path limit of ~240 characters
            if os.environ.get('DOCKER_BUILD', 'false') != 'true':
                tf.extract(full_path, '/tmp')
                self.assertEqual(
                    os.stat(os.path.join('/tmp', full_path)).st_size,
                    int(df.size))

def test_tar_experiment_download(self):
    self.assertTrue(all(df.verified for df in self.dfs))
    response = self.client.get(reverse(
        'tardis.tardis_portal.download.streaming_download_experiment',
        args=(self.exp.id, 'tar')))
    with NamedTemporaryFile('w') as tarfile:
        for c in response.streaming_content:
            tarfile.write(c)
        tarfile.flush()
        self.assertEqual(int(response['Content-Length']),
                         os.stat(tarfile.name).st_size)
        tf = TarFile(tarfile.name)
        if settings.EXP_SPACES_TO_UNDERSCORES:
            exp_title = self.exp.title.replace(' ', '_')
        else:
            exp_title = self.exp.title
        exp_title = urllib.parse.quote(
            exp_title, safe=settings.SAFE_FILESYSTEM_CHARACTERS)
        for df in self.dfs:
            full_path = os.path.join(
                exp_title,
                urllib.parse.quote(
                    self.ds.description,
                    safe=settings.SAFE_FILESYSTEM_CHARACTERS),
                df.directory, df.filename)
            # docker has a file path limit of ~240 characters
            if os.environ.get('DOCKER_BUILD', 'false') != 'true':
                tf.extract(full_path, '/tmp')
                self.assertEqual(
                    os.stat(os.path.join('/tmp', full_path)).st_size,
                    int(df.size))

def _extract_files_by_provided_names_tar(t_file: tarfile.TarFile,
                                         names: List[str],
                                         output_path: str) -> None:
    """Extract the members with the given names from the tar archive into output_path."""
    for name in names:
        member = t_file.getmember(name)
        extracting_path = output_path
        t_file.extract(member, path=extracting_path)

def extract(self, path, cb=None):
    tarfile = TarFile(fileobj=self.source.extractfile('./DATA'))
    for member in tarfile.getmembers():
        if member.name in ('/', ''):
            # extract can't handle making '/' when installing '/'
            continue
        tarfile.extract(member, path)
        if member.isfile() and cb:
            cb(self.name, os.path.join(path, member.name))

def _extract_tar_junk_path(tarfile_obj: tarfile.TarFile,
                           archive_extract_dir: Path):
    """
    Extract a tarfile while flattening any directory hierarchy
    in the archive.
    """
    for member in tarfile_obj.getmembers():
        if member.isdir():
            # Skip directories
            continue
        # Remove the directory hierarchy from the file
        member.name = Path(member.name).name
        output_file = archive_extract_dir / member.name
        LOGGER.debug(f"Extracting member '{member.name}' to '{output_file}'")
        tarfile_obj.extract(member, path=archive_extract_dir)

def _untar_layers(dir, layers):
    output = {}
    # Untar layer filesystem bundle
    for layer in layers:
        tarfile = TarFile(dir + "/" + layer)
        for member in tarfile.getmembers():
            output[member.name] = member
    for member_name in output:
        try:
            tarfile.extract(output[member_name], path=dir, set_attrs=False)
        except (ValueError, ReadError):
            pass
    # Clean up
    for layer in layers:
        clean_up(dir + "/" + layer[:-10])

def unpack_archive(archive_staging_dir, archive, external_id, target_path,
                   filelist=None):
    """Unpack a tar file containing the files that are in the
    MigrationArchive object"""
    # create the name of the archive
    archive_path = archive.get_archive_name(archive_staging_dir)
    # create the target directory if it doesn't exist
    try:
        os.makedirs(target_path)
    except:
        pass
    try:
        tar_file = TarFile(archive_path, 'r')
        # check that the tar_file digest matches the digest in the database
        digest = calculate_digest(archive_path)
        if digest != archive.digest:
            error_string = (
                "Digest does not match for archive: {}").format(archive_path)
            raise Exception(error_string)
    except:
        error_string = ("Could not find archive path: {}").format(archive_path)
        raise Exception(error_string)
    # untar each file
    for tar_info in tar_file.getmembers():
        try:
            # if filelist only extract those in the filelist
            if filelist:
                if tar_info.name in filelist:
                    tar_file.extract(tar_info, path=target_path)
            else:
                tar_file.extract(tar_info, path=target_path)
            logging.debug((
                "    Extracting file: {} from archive: {} to directory: {}"
            ).format(tar_info.name, archive.get_id(), target_path))
        except Exception as e:
            error_string = (
                "Could not extract file: {} from archive {} to path: {}, exception: {}"
            ).format(tar_info.name, archive.get_id(), target_path, str(e))
            logging.error(error_string)
            raise Exception(error_string)
    tar_file.close()

def test_tar_experiment_download(self):
    self.assertTrue(all(df.verified for df in self.dfs))
    response = self.client.get(
        reverse("tardis.tardis_portal.download.streaming_download_experiment",
                args=(self.exp.id, "tar"))
    )
    with NamedTemporaryFile("w") as tarfile:
        for c in response.streaming_content:
            tarfile.write(c)
        tarfile.flush()
        self.assertEqual(int(response["Content-Length"]),
                         os.stat(tarfile.name).st_size)
        tf = TarFile(tarfile.name)
        for df in self.dfs:
            full_path = os.path.join(
                self.exp.title.replace(" ", "_"),
                self.ds.description,
                df.directory,
                df.filename
            )
            tf.extract(full_path, "/tmp")
            self.assertEqual(os.stat(os.path.join("/tmp", full_path)).st_size,
                             int(df.size))

def extract(self, path, cb=None):
    file_path_list = []
    tarfile = TarFile(fileobj=self.source.extractfile('./DATA'))
    for member in tarfile.getmembers():
        if member.name in ('/', ''):
            # extract can't handle making '/' when installing '/'
            continue
        file_path = os.path.join(path, member.name)
        if cb is not None:
            cb(file_path)
        tarfile.extract(member, path)
        if member.isfile():
            file_path_list.append(file_path)
    return file_path_list

class FileSeekerTar(FileSeekerBase):
    def __init__(self, tar_file_path, temp_folder):
        FileSeekerBase.__init__(self)
        self.tar_file = TarFile(tar_file_path)
        self.temp_folder = temp_folder

    def search(self, filepattern):
        pathlist = []
        for member in self.tar_file.getmembers():
            if fnmatch.fnmatch(member.name, filepattern):
                try:
                    self.tar_file.extract(member.name, path=self.temp_folder)
                    pathlist.append(
                        os.path.join(self.temp_folder, Path(member.name)))
                except:
                    logfunc('Could not write file to filesystem')
        return pathlist

    def cleanup(self):
        self.tar_file.close()

def _untar_layers(dir, layers):
    output = {}
    # Untar layer filesystem bundle
    for layer in layers:
        tarfile = TarFile(dir + "/" + layer)
        for member in tarfile.getmembers():
            try:
                tarfile.extract(member, path=dir, set_attrs=False)
            except (ValueError, ReadError) as ex:
                if InternalServer.is_debug_logging_enabled():
                    message = "Unexpected exception of type {0} occurred while untaring the docker image: {1!r}" \
                        .format(type(ex).__name__,
                                ex.get_message() if type(ex).__name__ == 'DagdaError'
                                else ex.args)
                    DagdaLogger.get_logger().debug(message)
            except PermissionError as ex:
                message = "Unexpected error occurred while untaring the docker image: " + \
                          "Operation not permitted on {0!r}".format(member.name)
                DagdaLogger.get_logger().warn(message)
    # Clean up
    for layer in layers:
        clean_up(dir + "/" + layer[:-10])

def _extract_tar(archivefile: tarfile.TarFile, name: str, rep: str) -> bool:
    """Extract a tar archive"""
    mkdir(rep, mode=0o711)
    if not check_archive(archivefile, rep):
        raise BadArchive("malicious archive")
    try:
        for member in archivefile:
            archivefile.extract(member, rep, set_attrs=False)
            member_location = joinpath(rep, member.name)
            # python has no option to use umask while extracting, so…
            if isdir(member_location):
                chmod(member_location, 0o711)
            else:
                chmod(member_location, 0o644)
    except:
        # extraction failed, remove leftover files
        log.info('Extraction of %s failed, falling back to single-file upload',
                 name, exc_info=True)
        rmtree(rep)
        return False
    else:
        # remove old tar file
        remove(name)
        log.info('Successfully extracted tarfile %s into %s', name, rep)
        return True

def extract_tar_stream(tar: tarfile.TarFile, src: Text, dst: Text) -> None:
    for member in tar:
        if os.path.isdir(dst):
            if posixpath.join('/', member.path) == src:
                member.path = posixpath.basename(member.path)
                tar.extract(member, dst)
                if member.isdir():
                    dst = os.path.join(dst, member.path)
            else:
                member.path = posixpath.relpath(
                    posixpath.join('/', member.path), src)
                tar.extract(member, dst)
        elif member.isfile():
            with tar.extractfile(member) as inputfile:
                with open(dst, 'wb') as outputfile:
                    outputfile.write(inputfile.read())
        else:
            parent_dir = str(Path(dst).parent)
            member.path = posixpath.basename(member.path)
            tar.extract(member, parent_dir)

def extract(self, package_filename, location, path):
    tarfile = TarFile(package_filename, "r")
    tarfile.extract(location, path)

def _tar_extract(archive: TarFile, member: Union[str, TarInfo],
                 outdir: PathType) -> Path:
    archive.extract(member, outdir)
    member_name = member.name if isinstance(member, TarInfo) else member
    return Path(outdir) / member_name

def process_rpm(
    cfg: Config,
    s3session: s3.S3ServiceResource,
    tf: tarfile.TarFile,
    metadata: dict[str, Any],
    temp_dir: pathlib.Path,
    local_dir: pathlib.Path,
) -> None:
    bucket = s3session.Bucket(BUCKET)
    incoming_dir = temp_dir / "incoming"
    incoming_dir.mkdir()
    local_rpm_dir = local_dir / "rpm"
    local_rpm_dir.mkdir(parents=True, exist_ok=True)
    index_dir = local_rpm_dir / ".jsonindexes"
    index_dir.mkdir(exist_ok=True)

    rpms = []
    for member in tf.getmembers():
        if member.name in {".", "build-metadata.json"}:
            continue
        tf.extract(member, incoming_dir)
        fn = pathlib.Path(member.name)
        if fn.suffix == ".rpm":
            rpms.append(fn)

    dist = metadata["dist"]
    channel = metadata["channel"]
    arch = metadata["architecture"]

    idx = dist
    if channel != "stable":
        idx += f".{channel}"

    dist_dir = pathlib.Path(dist) / channel / arch
    local_dist_dir = local_rpm_dir / dist_dir
    local_dist_dir.mkdir(parents=True, exist_ok=True)

    sync_to_local(
        bucket,
        pathlib.Path("/rpm") / dist_dir,
        local_dist_dir,
        exact_timestamps=True,
    )

    sync_to_local(
        bucket,
        pathlib.Path("/rpm") / ".jsonindexes",
        index_dir,
        exact_timestamps=True,
    )

    repomd = local_dist_dir / "repodata" / "repomd.xml"
    if not repomd.exists():
        subprocess.run(
            [
                "createrepo_c",
                "--database",
                local_dist_dir,
            ],
            cwd=incoming_dir,
            check=True,
        )

    for rpm in rpms:
        subprocess.run(
            [
                "rpm",
                "--resign",
                rpm,
            ],
            input=b"\n",
            cwd=incoming_dir,
            check=True,
        )
        shutil.copy(incoming_dir / rpm, local_dist_dir / rpm)

    subprocess.run(
        [
            "createrepo_c",
            "--update",
            local_dist_dir,
        ],
        check=True,
    )

    gpg_detach_sign(repomd)

    existing: dict[tuple[str, str], Package] = {}
    packages: dict[tuple[str, str], Package] = {}

    idxfile = index_dir / f"{idx}.json"
    if idxfile.exists():
        with open(idxfile, "r") as f:
            data = json.load(f)
            if isinstance(data, dict) and (pkglist := data.get("packages")):
                for pkg in pkglist:
                    index_key = (pkg["basename"], pkg["version_key"])
                    existing[index_key] = Package(**pkg)

def process_generic(
    cfg: Config,
    s3session: s3.S3ServiceResource,
    tf: tarfile.TarFile,
    metadata: dict[str, Any],
    temp_dir: pathlib.Path,
    local_dir: pathlib.Path,
) -> None:
    bucket = s3session.Bucket(BUCKET)
    pkg_directories = set()
    rrules = {}

    basename = metadata["name"]
    slot = metadata.get("version_slot")
    slot_suf = f"-{slot}" if slot else ""
    channel = metadata["channel"]
    channel_suf = f".{channel}" if channel and channel != "stable" else ""
    target = metadata["target"]
    contents = metadata["contents"]

    pkg_dir = f"{target}{channel_suf}"
    pkg_directories.add(pkg_dir)

    staging_dir = temp_dir / pkg_dir
    os.makedirs(staging_dir)

    for member in tf.getmembers():
        if member.name in {".", "build-metadata.json"}:
            continue
        leaf = pathlib.Path(member.name)
        tf.extract(member, staging_dir)
        desc = contents[member.name]
        ext = desc["suffix"]
        asc_path = gpg_detach_sign(staging_dir / leaf)
        sha256_path = sha256(staging_dir / leaf)
        blake2b_path = blake2b(staging_dir / leaf)
        metadata_path = staging_dir / f"{leaf}.metadata.json"
        with open(metadata_path, "w") as f:
            json.dump(metadata, f)

        print(f"metadata={metadata}")
        print(f"target={target} leaf={leaf}")
        print(f"basename={basename} slot={slot}")
        print(f"channel={channel} pkg_dir={pkg_dir}")
        print(f"ext={ext}")

        # Store the fully-qualified artifact to archive/
        archive_dir = ARCHIVE / pkg_dir
        put(bucket, staging_dir / leaf, archive_dir, cache=True)
        put(bucket, asc_path, archive_dir, cache=True)
        put(bucket, sha256_path, archive_dir, cache=True)
        put(bucket, blake2b_path, archive_dir, cache=True)
        put(bucket, metadata_path, archive_dir, cache=True)

        if metadata.get("publish_link_to_latest"):
            # And record a copy of it in the dist/ directory as an
            # unversioned key for ease of reference in download
            # scripts.  Note: the archive/ entry is cached, but the
            # dist/ entry MUST NOT be cached for obvious reasons.
            # However, we still want the benefit of CDN for it, so
            # we generate a bucket-wide redirect policy for the
            # dist/ object to point to the archive/ object.  See
            # below for details.
            target_dir = DIST / pkg_dir
            dist_name = f"{basename}{slot_suf}{ext}"
            put(bucket, b"", target_dir, name=dist_name)
            asc_name = f"{dist_name}.asc"
            put(bucket, b"", target_dir, name=asc_name)
            sha_name = f"{dist_name}.sha256"
            put(bucket, b"", target_dir, name=sha_name)
            sha_name = f"{dist_name}.blake2b"
            put(bucket, b"", target_dir, name=sha_name)
            rrules[target_dir / dist_name] = archive_dir / leaf

    for pkg_dir in pkg_directories:
        remove_old(bucket, ARCHIVE / pkg_dir, keep=1, channel="nightly")
        make_generic_index(bucket, ARCHIVE, pkg_dir)

    if rrules:
        # We can't use per-object redirects, because in that case S3
        # generates the `301 Moved Permanently` response, and, adding
        # insult to injury, forgets to send the `Cache-Control` header,
        # which makes the response cacheable and useless for the purpose.
        # Luckily the "website" functionality of the bucket allows setting
        # redirection rules centrally, so that's what we do.
        #
        # The redirection rules are key prefix-based, and so we can use just
        # one redirect rule to handle both the main artifact and its
        # accompanying signature and checksum files.
        #
        # NOTE: Amazon S3 has a limitation of 50 routing rules per
        # website configuration.
        website = s3session.BucketWebsite(BUCKET)
        existing_rrules = list(website.routing_rules)
        for src, tgt in rrules.items():
            src_key = str(src)
            tgt_key = str(tgt)
            for rule in existing_rrules:
                condition = rule.get("Condition")
                if not condition:
                    continue
                if condition.get("KeyPrefixEquals") == src_key:
                    try:
                        redirect = rule["Redirect"]
                    except KeyError:
                        redirect = rule["Redirect"] = {}
                    redirect["ReplaceKeyPrefixWith"] = tgt_key
                    redirect["HttpRedirectCode"] = "307"
                    break
            else:
                existing_rrules.append({
                    "Condition": {
                        "KeyPrefixEquals": src_key,
                    },
                    "Redirect": {
                        "HttpRedirectCode": "307",
                        "Protocol": "https",
                        "HostName": "packages.edgedb.com",
                        "ReplaceKeyPrefixWith": tgt_key,
                    },
                })

        website_config: s3types.WebsiteConfigurationTypeDef = {
            "RoutingRules": existing_rrules,
        }
        if website.error_document is not None:
            website_config["ErrorDocument"] = cast(
                s3types.ErrorDocumentTypeDef,
                website.error_document,
            )
        if website.index_document is not None:
            website_config["IndexDocument"] = cast(
                s3types.IndexDocumentTypeDef,
                website.index_document,
            )
        if website.redirect_all_requests_to is not None:
            website_config["RedirectAllRequestsTo"] = cast(
                s3types.RedirectAllRequestsToTypeDef,
                website.redirect_all_requests_to,
            )

        print("updating bucket website config:")
        pprint.pprint(website_config)
        website.put(WebsiteConfiguration=website_config)

def process_apt(
    cfg: Config,
    s3session: s3.S3ServiceResource,
    tf: tarfile.TarFile,
    metadata: dict[str, Any],
    temp_dir: pathlib.Path,
    local_dir: pathlib.Path,
) -> None:
    bucket = s3session.Bucket(BUCKET)
    changes = None
    incoming_dir = temp_dir / "incoming"
    incoming_dir.mkdir()
    reprepro_logs = temp_dir / "reprepro-logs"
    reprepro_logs.mkdir()
    reprepro_tmp = temp_dir / "reprepro-tmp"
    reprepro_tmp.mkdir()
    reprepro_conf = temp_dir / "reprepro-conf"
    reprepro_conf.mkdir()
    local_apt_dir = local_dir / "apt"
    local_apt_dir.mkdir(parents=True, exist_ok=True)
    index_dir = local_apt_dir / ".jsonindexes"
    index_dir.mkdir(exist_ok=True)

    with open(reprepro_conf / "incoming", "wt") as f:
        dists = " ".join(d["codename"] for d in cfg["apt"]["distributions"])
        incoming = textwrap.dedent(f"""\
            Name: default
            IncomingDir: {str(incoming_dir)}
            TempDir: {str(reprepro_tmp)}
            Allow: {dists}
        """)
        f.write(incoming)

    with open(reprepro_conf / "distributions", "wt") as f:
        distributions = generate_reprepro_distributions(cfg)
        f.write(distributions)

    for member in tf.getmembers():
        if member.name in {".", "build-metadata.json"}:
            continue
        tf.extract(member, incoming_dir)
        fn = pathlib.Path(member.name)
        if fn.suffix == ".changes":
            if changes is not None:
                print("Multiple .changes files in apt tarball")
                return
            changes = fn

    for sub in [".jsonindexes", "db", "dists"]:
        sync_to_local(
            bucket,
            pathlib.Path("/apt") / sub,
            local_apt_dir / sub,
            exact_timestamps=True,
        )

    sync_to_local(
        bucket,
        pathlib.Path("/apt") / "pool",
        local_apt_dir / "pool",
    )

    subprocess.run(
        [
            "reprepro",
            "-V",
            "-V",
            f"--confdir={str(reprepro_conf)}",
            f"--basedir={str(local_apt_dir)}",
            f"--logdir={str(reprepro_logs)}",
            "processincoming",
            "default",
            str(changes),
        ],
        cwd=incoming_dir,
        check=True,
    )

    result = subprocess.run(
        [
            "reprepro",
            f"--confdir={str(reprepro_conf)}",
            f"--basedir={str(local_apt_dir)}",
            f"--logdir={str(reprepro_logs)}",
            "dumpreferences",
        ],
        text=True,
        check=True,
        stdout=subprocess.PIPE,
        stderr=None,
    )

    repo_dists = set()
    for line in result.stdout.split("\n"):
        if not line.strip():
            continue
        dist, _, _ = line.partition("|")
        repo_dists.add(dist)

    list_format = (r"\0".join((
        r"${$architecture}",
        r"${$component}",
        r"${package}",
        r"${version}",
        r"${$fullfilename}",
        r"${Installed-Size}",
        r"${Metapkg-Metadata}",
    )) + r"\n")

    existing: dict[str, dict[tuple[str, str], Package]] = {}
    packages: dict[str, dict[tuple[str, str], Package]] = {}

    for dist in repo_dists:
        result = subprocess.run(
            [
                "reprepro",
                f"--confdir={str(reprepro_conf)}",
                f"--basedir={str(local_apt_dir)}",
                f"--logdir={str(reprepro_logs)}",
                f"--list-format={list_format}",
                "list",
                dist,
            ],
            text=True,
            check=True,
            stdout=subprocess.PIPE,
            stderr=None,
        )

        for line in result.stdout.split("\n"):
            if not line.strip():
                continue
            (
                arch,
                component,
                pkgname,
                pkgver,
                pkgfile,
                size,
                pkgmetadata_json,
            ) = line.split("\0")

            if component != "main" and not dist.endswith(component):
                index_dist = f"{dist}.{component}"
            else:
                index_dist = dist

            prev_dist_packages = existing.get(index_dist)
            if prev_dist_packages is None:
                idxfile = index_dir / f"{index_dist}.json"
                prev_dist_packages = {}
                if idxfile.exists():
                    with open(idxfile, "r") as f:
                        data = json.load(f)
                        if isinstance(data, dict) and (
                            pkglist := data.get("packages")
                        ):
                            for pkg in pkglist:
                                index_key = (
                                    pkg["basename"],
                                    pkg["version_key"],
                                )
                                prev_dist_packages[index_key] = Package(**pkg)
                existing[index_dist] = prev_dist_packages

            dist_packages = packages.get(index_dist)
            if dist_packages is None:
                packages[index_dist] = dist_packages = {}

            if arch == "amd64":
                arch = "x86_64"

            is_metapackage = int(size) < 20

            relver, _, revver = pkgver.rpartition("-")

            m = slot_regexp.match(pkgname)
            if not m:
                print("cannot parse package name: {}".format(pkgname))
                basename = pkgname
                slot = None
            else:
                basename = m.group(1)
                slot = m.group(2)

            if pkgmetadata_json:
                pkgmetadata = json.loads(pkgmetadata_json)
                if is_metapackage:
                    pkgmetadata["name"] = basename
                parsed_ver = pkgmetadata["version_details"]
            else:
                parsed_ver = parse_version(relver)
                pkgmetadata = {
                    "name": basename,
                    "version": relver,
                    "version_slot": slot,
                    "version_details": parsed_ver,
                    "architecture": arch,
                    "revision": revver,
                }

            version_key = format_version_key(parsed_ver, revver)
            ver_metadata = pkgmetadata["version_details"]["metadata"]
            index_key = (pkgmetadata["name"], version_key)

            if index_key in prev_dist_packages:
                dist_packages[index_key] = prev_dist_packages[index_key]
                dist_packages[index_key]["architecture"] = arch
            else:
                if basename == "edgedb-server" and not ver_metadata.get(
                    "catalog_version"
                ):
                    if not pathlib.Path(pkgfile).exists():
                        print(f"package file does not exist: {pkgfile}")
                    else:
                        catver = extract_catver_from_deb(pkgfile)
                        if catver is None:
                            print(
                                f"cannot extract catalog version from {pkgfile}"
                            )
                        else:
                            ver_metadata["catalog_version"] = str(catver)
                            print(f"extracted catver {catver} from {pkgfile}")

                installref = InstallRef(
                    ref="{}={}-{}".format(pkgname, relver, revver),
                    type=None,
                    encoding=None,
                    verification={},
                )

                append_artifact(dist_packages, pkgmetadata, installref)
                print("makeindex: noted {}".format(installref["ref"]))