def load_manifest_from_file(filename: str, type: Type[T]) -> T:
    """Parse a manifest YAML file, deserialize it and dump it to SSM.

    Before parsing, the directory holding the file is registered as the base
    directory for YAML includes on ``yaml.SafeLoader`` and the SSM-parameter
    and environment-variable injectors are installed.

    Args:
        filename: Path of the manifest file; it is resolved to an absolute path.
        type: Manifest class to produce, ``Manifest`` or ``FoundationManifest``.

    Returns:
        The deserialized manifest, after it has been handed to
        ``ManifestSerDe.dump_manifest_to_ssm``.

    Raises:
        ValueError: If ``type`` is neither ``Manifest`` nor
            ``FoundationManifest``. The file has already been read by then.
    """
    _logger.debug("Loading manifest file (%s)", filename)
    manifest_path = os.path.abspath(filename)
    _logger.debug("filepath: %s", manifest_path)
    base_dir: str = os.path.dirname(manifest_path)
    utils.print_dir(dir=base_dir)

    # Loader customisation has to be in place before the file is parsed.
    YamlIncludeConstructor.add_to_loader_class(loader_class=yaml.SafeLoader, base_dir=base_dir)
    _add_ssm_param_injector()
    _add_env_var_injector()

    with open(manifest_path, "r") as stream:
        raw: Dict[str, Any] = cast(Dict[str, Any], yaml.safe_load(stream))
    _logger.debug("raw: %s", raw)

    # Each manifest flavour gets its own SSM parameter name and schema.
    if type is Manifest:
        raw["SsmParameterName"] = f"/orbit/{raw['Name']}/manifest"
        schema = Manifest.Schema()
    elif type is FoundationManifest:
        raw["SsmParameterName"] = f"/orbit-f/{raw['Name']}/manifest"
        schema = FoundationManifest.Schema()
    else:
        raise ValueError("Unknown 'manifest' Type")

    manifest: T = cast(T, schema.load(data=raw, many=False, partial=False, unknown=EXCLUDE))
    ManifestSerDe.dump_manifest_to_ssm(manifest=manifest)
    return manifest
def destroy_kubeflow(context: Context) -> None:
    """Delete the Kubeflow resources of the environment's EKS cluster.

    Nothing is done when the ``eksctl-orbit-<name>-cluster`` stack does not
    exist, or when ``kubectl get namespace kubeflow`` prints a line containing
    ``"kubeflow" not found``.

    Otherwise the Kubeflow configuration is generated under
    ``.orbit.out/<name>/kubeflow/orbit-<name>`` and ``./delete_kf.sh`` is run
    from that directory. The script is attempted at most three times, waiting
    300 seconds after each failure, and the loop stops as soon as one run
    succeeds. Exhausting all attempts is logged but does not raise.

    Args:
        context: Environment context; ``context.name`` is used to derive the
            stack name, the cluster name and the output path.
    """
    stack_name: str = f"orbit-{context.name}"
    final_eks_stack_name: str = f"eksctl-{stack_name}-cluster"
    _logger.debug("EKSCTL stack name: %s", final_eks_stack_name)

    if not cfn.does_stack_exist(stack_name=final_eks_stack_name):
        return

    kubectl.write_kubeconfig(context=context)

    # No kubeflow namespace means there is nothing to tear down.
    for line in sh.run_iterating("kubectl get namespace kubeflow"):
        if '"kubeflow" not found' in line:
            return

    cluster_name = f"orbit-{context.name}"
    output_path = os.path.join(".orbit.out", context.name, "kubeflow", cluster_name)
    gen_kubeflow_config(context, output_path, cluster_name)

    _logger.debug("Destroying Kubeflow")
    output_path = os.path.abspath(output_path)
    _logger.debug(f"kubeflow config dir: {output_path}")
    utils.print_dir(output_path)

    max_attempts = 3
    for _ in range(max_attempts):
        try:
            _logger.info("Deleting kubeflow resources")
            sh.run("./delete_kf.sh", cwd=output_path)
        except FailedShellCommand:
            _logger.info("The command returned a non-zero exit code. Retrying to delete resources")
            time.sleep(300)
        else:
            # Success: stop here. Without this exit the script would be
            # re-run over and over, because only failures advance the loop.
            break
    else:
        _logger.warning("Kubeflow resources could not be deleted after %s attempts", max_attempts)
def update_docker_file(context: "Context", dir: str) -> None:
    """Resolve the parameters of ``<dir>/Dockerfile`` in place.

    The directory is always logged and printed. If it holds no ``Dockerfile``
    nothing else happens. Otherwise the file content is passed through
    ``utils.resolve_parameters`` with the keys ``region``, ``account``,
    ``env`` and ``jupyter_user_base`` and written back to the same path.

    ``jupyter_user_base`` is the environment's own ECR ``jupyter-user`` image
    when the jupyter-user image source is ``"code"``; otherwise it is the
    configured repository. Both use the configured version as the tag.

    Args:
        context: Environment context supplying account, region, name and the
            jupyter-user image settings.
        dir: Directory expected to contain the ``Dockerfile``.
    """
    _logger.debug("Docker directory before building: %s", os.path.abspath(dir))
    utils.print_dir(dir)

    docker_file = os.path.join(dir, "Dockerfile")
    if not os.path.exists(docker_file):
        return

    _logger.info("Building DockerFile %s", docker_file)
    tag = context.images.jupyter_user.version
    image_source = context.images.jupyter_user.get_source(account_id=context.account_id, region=context.region)
    if image_source == "code":
        jupyter_user_base = (
            f"{context.account_id}.dkr.ecr.{context.region}.amazonaws.com/orbit-{context.name}/jupyter-user:{tag}"
        )
    else:
        jupyter_user_base = f"{context.images.jupyter_user.repository}:{tag}"

    with open(docker_file, "r") as reader:
        template: str = reader.read()

    substitutions = {
        "region": context.region,
        "account": context.account_id,
        "env": context.name,
        "jupyter_user_base": jupyter_user_base,
    }
    resolved = utils.resolve_parameters(template, substitutions)

    with open(docker_file, "w") as writer:
        writer.write(resolved)
def remote_cli(command: str, args: Tuple[str]) -> None:
    """Run command remotely on CodeBuild.

    Debug logging is enabled with the remote format, the current working
    directory is printed (minus a few noisy folders), and the attribute of
    ``RemoteCommands`` named by ``command`` is called with ``args``.

    Args:
        command: Name of the ``RemoteCommands`` attribute to invoke.
        args: Arguments handed to that callable as a single tuple.
    """
    enable_debug(format=DEBUG_LOGGING_FORMAT_REMOTE)
    # Imported at call time, as in the original implementation.
    from aws_orbit.remote_files import REMOTE_FUNC_TYPE, RemoteCommands

    _logger.debug("Remote bundle structure:")
    ignored_dirs = ["__pycache__", "cdk", ".venv", ".mypy_cache"]
    print_dir(os.getcwd(), exclude=ignored_dirs)

    handler: REMOTE_FUNC_TYPE = getattr(RemoteCommands, command)
    handler(args)
def deploy(
    plugin_id: str,
    context: "Context",
    team_context: "TeamContext",
    parameters: Dict[str, Any],
) -> None:
    """Package and install the plugin's Helm chart for a team.

    The release is named ``<team>-<plugin_id>`` (underscores in ``plugin_id``
    are replaced by dashes). A copy of the charts at ``CHART_PATHS`` is
    created for the team, the team Helm repository is added, and the chart is
    packaged and installed into the team's namespace.

    Args:
        plugin_id: Plugin identifier.
        context: Environment context (name, region, account id).
        team_context: Team context (name, team Helm repository).
        parameters: Plugin parameters; not read by this function.

    Raises:
        Exception: If the team has no Helm repository configured.
    """
    _logger.debug("Team Env name: %s | Team name: %s", context.name, team_context.name)
    plugin_id = plugin_id.replace("_", "-")
    _logger.debug("plugin_id: %s", plugin_id)

    release_name = f"{team_context.name}-{plugin_id}"
    _logger.info("Checking Chart %s is installed...", release_name)
    if helm.is_exists_chart_release(release_name, team_context.name):
        # NOTE(review): nothing in this function removes the existing release;
        # confirm that helm.install_chart replaces or upgrades it.
        _logger.info("Chart %s already installed, removing to begin new install", release_name)

    chart_values: Dict[str, Optional[str]] = dict(
        team=team_context.name,
        region=context.region,
        account_id=context.account_id,
        env_name=context.name,
        plugin_id=plugin_id,
    )

    chart_path = helm.create_team_charts_copy(team_context=team_context, path=CHART_PATHS, target_path=plugin_id)
    _logger.debug("package dir")
    utils.print_dir(CHART_PATHS)
    _logger.debug("copy chart dir")
    utils.print_dir(chart_path)

    # Fail fast with a clear message instead of handing an empty repository
    # location to helm.
    if not team_context.team_helm_repository:
        raise Exception("Missing team helm repository")
    repo_location = team_context.team_helm_repository
    repo = team_context.name
    helm.add_repo(repo=repo, repo_location=repo_location)

    chart_name, chart_version, chart_package = helm.package_chart(
        repo=repo, chart_path=chart_path, values=chart_values
    )
    _logger.info("Chart %s installing ", release_name)
    helm.install_chart(
        repo=repo,
        namespace=team_context.name,
        name=release_name,
        chart_name=chart_name,
        chart_version=chart_version,
    )

    # NOTE(review): same arguments as the packaging call above, so this looks
    # redundant; kept because package_chart's side effects are not visible here.
    chart_name, chart_version, chart_package = helm.package_chart(
        repo=repo, chart_path=chart_path, values=chart_values
    )
    _logger.info(
        f"Sagemaker Operator Helm Chart {chart_name}@{chart_version} installed for {team_context.name} at {chart_package}"
    )
def deploy(plugin_id: str, context: "Context", team_context: "TeamContext", parameters: Dict[str, Any]) -> None:
    """Render the plugin's script into a chart copy and install the chart.

    A team copy of the chart at ``CHART_PATH`` is created, the ``script``
    parameter is resolved against the chart values and written to
    ``script.txt`` inside that copy, and the chart is packaged and installed
    as release ``<team>-<plugin_id>`` in the team's namespace.

    Args:
        plugin_id: Plugin identifier; underscores are replaced by dashes.
        context: Environment context (name, region, account id, images,
            toolkit bucket).
        team_context: Team context; its name is used as repo and namespace.
        parameters: Plugin parameters. ``script`` is required;
            ``restartPolicy`` is optional and defaults to ``"Never"``.

    Raises:
        Exception: If ``parameters`` has no ``script`` entry. The chart copy
            has already been created at that point.
    """
    _logger.debug("Team Env name: %s | Team name: %s", context.name, team_context.name)
    plugin_id = plugin_id.replace("_", "-")
    _logger.debug("plugin_id: %s", plugin_id)

    chart_path = helm.create_team_charts_copy(team_context=team_context, path=CHART_PATH)
    _logger.debug("package dir")
    utils.print_dir(CHART_PATH)
    _logger.debug("copy chart dir")
    utils.print_dir(chart_path)

    chart_values: Dict[str, Optional[str]] = {
        "team": team_context.name,
        "region": context.region,
        "account_id": context.account_id,
        "env_name": context.name,
        "tag": context.images.jupyter_hub.version,
        "restart_policy": parameters.get("restartPolicy", "Never"),
        "plugin_id": plugin_id,
        "toolkit_s3_bucket": context.toolkit.s3_bucket,
        # Development builds always pull so that fresh images are picked up.
        "image_pull_policy": "Always" if aws_orbit.__version__.endswith(".dev0") else "IfNotPresent",
    }

    if "script" not in parameters:
        raise Exception(f"Plugin {plugin_id} must define parameter 'script'")
    script_file = os.path.join(chart_path, "script.txt")
    script_body = utils.resolve_parameters(parameters["script"], chart_values)
    with open(script_file, "w") as writer:
        writer.write(script_body)

    repo_location = helm.init_team_repo(context=context, team_context=team_context)
    repo = team_context.name
    _logger.debug(script_body)
    helm.add_repo(repo=repo, repo_location=repo_location)

    chart_name, chart_version, chart_package = helm.package_chart(
        repo=repo, chart_path=chart_path, values=chart_values
    )
    helm.install_chart(
        repo=repo,
        namespace=team_context.name,
        name=f"{team_context.name}-{plugin_id}",
        chart_name=chart_name,
        chart_version=chart_version,
    )
def destroy_teams(context: "Context") -> None:
    """Best-effort ``kubectl delete`` of the generated teams manifest.

    Does nothing unless the ``eksctl-orbit-<name>-cluster`` stack exists.
    When it does, the kubeconfig is written through ``eksctl``, the teams
    manifest is generated and a forced, non-waiting ``kubectl delete`` is run
    against it. A failing delete command is logged at debug level and ignored.

    Args:
        context: Environment context; ``context.name`` identifies the cluster.
    """
    eks_stack_name: str = f"eksctl-orbit-{context.name}-cluster"
    _logger.debug("EKSCTL stack name: %s", eks_stack_name)
    if not cfn.does_stack_exist(stack_name=eks_stack_name):
        return

    sh.run(f"eksctl utils write-kubeconfig --cluster orbit-{context.name} --set-kubeconfig-context")
    k8s_context = get_k8s_context(context=context)
    _logger.debug("kubectl k8s_context: %s", k8s_context)
    _logger.debug("Attempting kubectl delete")

    manifest_path = _generate_teams_manifest(context=context)
    utils.print_dir(dir=manifest_path)

    delete_cmd = (
        f"kubectl delete -f {manifest_path} --grace-period=0 --force "
        f"--ignore-not-found --wait=false --context {k8s_context}"
    )
    try:
        sh.run(delete_cmd)
    except exceptions.FailedShellCommand as ex:
        # Let's leave for eksctl, it will destroy everything anyway...
        _logger.debug("Skipping: %s", ex)
def deploy_kubeflow(context: Context) -> None:
    """Generate the Kubeflow configuration and apply it to the cluster.

    Does nothing unless the ``eksctl-orbit-<name>-cluster`` stack exists.
    Otherwise the configuration is generated under
    ``.orbit.out/<name>/kubeflow/orbit-<name>``, ``./apply_kf.sh`` is run from
    that directory and, after a 120 second pause, the
    ``nvidia-device-plugin-daemonset`` daemonset is deleted from the
    ``kubeflow`` namespace.

    Args:
        context: Environment context; ``context.name`` is used to derive the
            stack name, the cluster name and the output path.
    """
    stack_name: str = f"orbit-{context.name}"
    final_eks_stack_name: str = f"eksctl-{stack_name}-cluster"
    _logger.debug("EKSCTL stack name: %s", final_eks_stack_name)
    if not cfn.does_stack_exist(stack_name=final_eks_stack_name):
        return

    cluster_name = f"orbit-{context.name}"
    relative_dir = os.path.join(".orbit.out", context.name, "kubeflow", cluster_name)
    gen_kubeflow_config(context, relative_dir, cluster_name)

    _logger.debug("Deploying Kubeflow")
    output_path = os.path.abspath(relative_dir)
    _logger.debug(f"kubeflow config dir: {output_path}")
    utils.print_dir(output_path)

    sh.run("./apply_kf.sh", cwd=output_path)
    # Pause before removing the daemonset, as the original sequence does.
    time.sleep(120)
    sh.run("kubectl delete ds -n kubeflow nvidia-device-plugin-daemonset")
def update_docker_file(account_id: str, region: str, env: str, tag: str, dir: str) -> None:
    """Resolve the parameters of ``<dir>/Dockerfile`` in place.

    The directory is always logged and printed. If it holds no ``Dockerfile``
    nothing else happens. Otherwise the file content is passed through
    ``utils.resolve_parameters`` with the keys ``region``, ``account``,
    ``env`` and ``jupyter_user_base`` and written back to the same path.

    Args:
        account_id: AWS account id used in the ECR image URI.
        region: AWS region used in the ECR image URI.
        env: Environment name used in the ``orbit-<env>`` repository prefix.
        tag: Tag of the ``jupyter-user`` base image.
        dir: Directory expected to contain the ``Dockerfile``.
    """
    _logger.debug("Docker directory before building: %s", os.path.abspath(dir))
    utils.print_dir(dir)

    docker_file = os.path.join(dir, "Dockerfile")
    if not os.path.exists(docker_file):
        return

    _logger.info("Building DockerFile %s", docker_file)
    jupyter_user_base = f"{account_id}.dkr.ecr.{region}.amazonaws.com/orbit-{env}/jupyter-user:{tag}"
    _logger.debug(f"update_docker_file: jupyter_user_base = {jupyter_user_base}")

    with open(docker_file, "r") as reader:
        template: str = reader.read()

    substitutions = {
        "region": region,
        "account": account_id,
        "env": env,
        "jupyter_user_base": jupyter_user_base,
    }
    resolved = utils.resolve_parameters(template, substitutions)

    with open(docker_file, "w") as writer:
        writer.write(resolved)
def deploy(
    plugin_id: str,
    context: "Context",
    team_context: "TeamContext",
    parameters: Dict[str, Any],
) -> None:
    """Deploy the Lustre storage class for a team and publish the user chart.

    When the release ``<team>-<plugin_id>`` is not installed yet, ingress on
    TCP port 988 from the cluster security group is authorized on the team
    security group and the ``fsx-storageclass`` chart is packaged and
    installed (without upgrade) from the team Helm repository.

    In every case ``get_user_pv`` is then called and the ``fsx-filesystem``
    chart is packaged into the ``<team>--user`` Helm repository.

    Args:
        plugin_id: Plugin identifier; underscores are replaced by dashes.
        context: Environment context (name, region, account id, networking,
            images, cluster security group).
        team_context: Team context (name, security group, scratch bucket,
            team and user Helm repositories).
        parameters: Plugin parameters. ``storage`` defaults to ``"1200Gi"``
            and ``folder`` defaults to ``"data"``.

    Raises:
        Exception: If the team or the user Helm repository is missing.
    """
    _logger.debug("Team Env name: %s | Team name: %s", context.name, team_context.name)
    plugin_id = plugin_id.replace("_", "-")
    _logger.debug("plugin_id: %s", plugin_id)
    release_name = f"{team_context.name}-{plugin_id}"
    _logger.info("Checking Chart %s is installed...", release_name)
    fs_name = f"lustre-{team_context.name}-fs-{plugin_id}"

    chart_values: Dict[str, Optional[str]] = dict(
        team=team_context.name,
        region=context.region,
        account_id=context.account_id,
        env_name=context.name,
        plugin_id=plugin_id,
        deploymentType="SCRATCH_2",
        sg=team_context.team_security_group_id,
        subnet=context.networking.data.nodes_subnets[0],
        s3importpath=f"s3://{team_context.scratch_bucket}/{team_context.name}/lustre",
        s3exportpath=f"s3://{team_context.scratch_bucket}/{team_context.name}/lustre",
        storage=parameters.get("storage", "1200Gi"),
        folder=parameters.get("folder", "data"),
        k8s_utilities_image=f"{context.images.k8s_utilities.repository}:{context.images.k8s_utilities.version}",
        fs_name=fs_name,
    )

    # NOTE(review): the extent of this branch was reconstructed from the data
    # flow (the team-chart steps only make sense for a fresh install) - confirm.
    if helm.is_exists_chart_release(release_name, team_context.name):
        _logger.info("Chart %s already installed, skipping installation", release_name)
    else:
        # The original logged "already installed, skipping installation" here,
        # i.e. in the branch that actually performs the installation.
        _logger.info("Chart %s not installed, installing", release_name)
        ec2.authorize_security_group_ingress(
            group_id=cast(str, team_context.team_security_group_id),
            ip_permissions=[
                IpPermission(
                    from_port=988,
                    to_port=988,
                    ip_protocol="tcp",
                    user_id_group_pairs=[
                        UserIdGroupPair(
                            description="All from Cluster",
                            group_id=cast(str, context.cluster_sg_id),
                        )
                    ],
                )
            ],
        )

        chart_path = helm.create_team_charts_copy(
            team_context=team_context, path=TEAM_CHARTS_PATH, target_path=plugin_id
        )
        _logger.debug("package dir")
        utils.print_dir(TEAM_CHARTS_PATH)
        _logger.debug("copy chart dir")
        utils.print_dir(chart_path)

        if not team_context.team_helm_repository:
            raise Exception("Missing team helm repository")
        repo_location = team_context.team_helm_repository
        repo = team_context.name
        helm.add_repo(repo=repo, repo_location=repo_location)
        chart_name, chart_version, chart_package = helm.package_chart(
            repo=repo, chart_path=os.path.join(chart_path, "fsx-storageclass"), values=chart_values
        )
        helm.install_chart_no_upgrade(
            repo=repo,
            namespace=team_context.name,
            name=release_name,
            chart_name=chart_name,
            chart_version=chart_version,
        )

    get_user_pv(fs_name, plugin_id, context, team_context, chart_values)

    # install this package at the user helm repository such that its installed on every user space
    chart_path = helm.create_team_charts_copy(team_context=team_context, path=USER_CHARTS_PATH, target_path=plugin_id)
    if not team_context.user_helm_repository:
        raise Exception("Missing user helm repository")
    user_location = team_context.user_helm_repository
    user_repo = team_context.name + "--user"
    helm.add_repo(repo=user_repo, repo_location=user_location)
    chart_name, chart_version, chart_package = helm.package_chart(
        repo=user_repo, chart_path=os.path.join(chart_path, "fsx-filesystem"), values=chart_values
    )
    _logger.info(
        f"Lustre Helm Chart {chart_name}@{chart_version} installed for {team_context.name} at {chart_package}"
    )
def deploy(plugin_id: str, context: "Context", team_context: "TeamContext", parameters: Dict[str, Any]) -> None:
    """Install the Lustre storage class chart for a team, once.

    Returns immediately when the release ``<team>-<plugin_id>`` already
    exists. Otherwise a previous ``fsx-lustre-<team>-fast-fs-lustre`` storage
    class is deleted (failures are only logged), ingress on TCP port 988 from
    the cluster security group is authorized on the team security group, and
    the ``fsx_storageclass`` chart is packaged and installed without upgrade.
    Finally the ``fsx_filesystem`` chart is packaged into the same repository.

    Args:
        plugin_id: Plugin identifier; underscores are replaced by dashes.
        context: Environment context (name, region, account id, networking,
            cluster security group).
        team_context: Team context (name, security group, scratch bucket).
        parameters: Plugin parameters; not read by this function.
    """
    _logger.debug("Team Env name: %s | Team name: %s", context.name, team_context.name)
    plugin_id = plugin_id.replace("_", "-")
    _logger.debug("plugin_id: %s", plugin_id)

    release_name = f"{team_context.name}-{plugin_id}"
    _logger.info("Checking Chart %s is installed...", release_name)
    if helm.is_exists_chart_release(release_name, team_context.name):
        _logger.info("Chart %s already installed, skipping installation", release_name)
        return

    # Best effort: a leftover storage class is removed, errors are only logged.
    try:
        sh.run(f"kubectl delete sc fsx-lustre-{team_context.name}-fast-fs-lustre")
    except Exception as e:
        _logger.error(f"Deleting prior sc 'fsx-lustre-{team_context.name}-fast-fs-lustre' failed with:%s", str(e))

    chart_values: Dict[str, Optional[str]] = {
        "team": team_context.name,
        "region": context.region,
        "account_id": context.account_id,
        "env_name": context.name,
        "plugin_id": plugin_id,
        "deploymentType": "SCRATCH_2",
        "sg": team_context.team_security_group_id,
        "subnet": context.networking.data.nodes_subnets[0],
        "s3importpath": f"s3://{team_context.scratch_bucket}/{team_context.name}/lustre",
        "s3exportpath": f"s3://{team_context.scratch_bucket}/{team_context.name}/lustre",
    }

    cluster_pair = UserIdGroupPair(description="All from Cluster", group_id=cast(str, context.cluster_sg_id))
    lustre_ingress = IpPermission(
        from_port=988,
        to_port=988,
        ip_protocol="tcp",
        user_id_group_pairs=[cluster_pair],
    )
    ec2.authorize_security_group_ingress(
        group_id=cast(str, team_context.team_security_group_id),
        ip_permissions=[lustre_ingress],
    )

    chart_path = helm.create_team_charts_copy(team_context=team_context, path=CHARTS_PATH)
    _logger.debug("package dir")
    utils.print_dir(CHARTS_PATH)
    _logger.debug("copy chart dir")
    utils.print_dir(chart_path)

    repo_location = helm.init_team_repo(context=context, team_context=team_context)
    repo = team_context.name
    helm.add_repo(repo=repo, repo_location=repo_location)

    storageclass_chart = os.path.join(chart_path, "fsx_storageclass")
    chart_name, chart_version, chart_package = helm.package_chart(
        repo=repo, chart_path=storageclass_chart, values=chart_values
    )
    helm.install_chart_no_upgrade(
        repo=repo,
        namespace=team_context.name,
        name=release_name,
        chart_name=chart_name,
        chart_version=chart_version,
    )

    filesystem_chart = os.path.join(chart_path, "fsx_filesystem")
    chart_name, chart_version, chart_package = helm.package_chart(
        repo=repo, chart_path=filesystem_chart, values=chart_values
    )
    _logger.info(
        f"Lustre Helm Chart {chart_name}@{chart_version} installed for {team_context.name} at {chart_package}"
    )
def helm_package(
    plugin_id: str, context: "Context", team_context: "TeamContext", parameters: Dict[str, Any]
) -> Tuple[str, str, str]:
    """Render the plugin's script and package the ``team-script-launcher`` chart.

    A team copy of the chart at ``CHART_PATH`` is created, the ``script``
    parameter is resolved against the chart values and written to
    ``team-script-launcher/script.txt`` inside that copy. The team repository
    is then initialised and added, and the chart is packaged.

    Image selection: with no ``image`` parameter the jupyter-user image is
    used; an ``image`` containing ``aws-orbit-workbench/utility-data`` maps to
    the configured utility-data image; any other value is used as given.

    Args:
        plugin_id: Plugin identifier, also used as the chart copy target path.
        context: Environment context (name, region, account id, images,
            toolkit bucket).
        team_context: Team context (name, team Helm repository).
        parameters: Plugin parameters. ``script`` is required; ``image``,
            ``tag``, ``restartPolicy`` (default ``"Never"``), ``uid`` (default
            ``"1000"``) and ``gid`` (default ``"100"``) are optional.

    Returns:
        The ``(chart_name, chart_version, chart_package)`` triple produced by
        ``helm.package_chart``.

    Raises:
        Exception: If ``script`` is missing from ``parameters`` or the team
            has no Helm repository configured.
    """
    chart_path = helm.create_team_charts_copy(team_context=team_context, path=CHART_PATH, target_path=plugin_id)
    _logger.debug("copy chart dir")
    utils.print_dir(chart_path)

    if "image" in parameters:
        if "aws-orbit-workbench/utility-data" in parameters["image"]:
            image = f"{context.images.utility_data.repository}:{context.images.utility_data.version}"
        else:
            image = parameters["image"]
    else:
        image = f"{context.images.jupyter_user.repository}:{context.images.jupyter_user.version}"
    _logger.debug(f"For plugin {plugin_id} using image: {image}")

    chart_values: Dict[str, Optional[str]] = {
        "team": team_context.name,
        "region": context.region,
        "account_id": context.account_id,
        "env_name": context.name,
        "tag": parameters["tag"] if "tag" in parameters else context.images.jupyter_user.version,
        "restart_policy": parameters.get("restartPolicy", "Never"),
        "plugin_id": plugin_id,
        "toolkit_s3_bucket": context.toolkit.s3_bucket,
        # Development builds always pull so that fresh images are picked up.
        "image_pull_policy": "Always" if aws_orbit.__version__.endswith(".dev0") else "IfNotPresent",
        "image": image,
        "uid": parameters.get("uid", "1000"),
        "gid": parameters.get("gid", "100"),
    }

    if "script" not in parameters:
        raise Exception(f"Plugin {plugin_id} must define parameter 'script'")
    launcher_dir = os.path.join(chart_path, "team-script-launcher")
    script_file = os.path.join(chart_path, "team-script-launcher", "script.txt")
    script_body = utils.resolve_parameters(parameters["script"], chart_values)
    with open(script_file, "w") as writer:
        writer.write(script_body)

    if not team_context.team_helm_repository:
        raise Exception("Missing team helm repository")
    repo_location = team_context.team_helm_repository
    repo = team_context.name
    _logger.debug(script_body)

    _init_team_repo(context=context, team_context=team_context, repo_location=repo_location)
    helm.add_repo(repo=repo, repo_location=repo_location)
    chart_name, chart_version, chart_package = helm.package_chart(
        repo=repo, chart_path=launcher_dir, values=chart_values
    )
    return (chart_name, chart_version, chart_package)