def validate_support_config(cluster_name):
    """
    Validates the provided non-encrypted helm chart values files for the support chart
    of a specific cluster.
    """
    _prepare_helm_charts_dependencies_and_schemas()

    config_file_path = find_absolute_path_to_cluster_file(cluster_name)
    with open(config_file_path) as f:
        cluster = Cluster(yaml.load(f), config_file_path.parent)

    # Bail out early when the cluster declares no support chart at all
    if not cluster.support:
        print_colour(f"No support defined for {cluster_name}. Nothing to validate!")
        return

    print_colour(
        f"Validating non-encrypted support values files for {cluster_name}..."
    )

    # Render the support chart with every listed values file; helm itself
    # performs the validation and fails on bad values.
    cmd = ["helm", "template", str(helm_charts_dir.joinpath("support"))]
    cmd.extend(
        f"--values={config_file_path.parent.joinpath(values_file)}"
        for values_file in cluster.support["helm_chart_values_files"]
    )

    try:
        subprocess.check_output(cmd, text=True)
    except subprocess.CalledProcessError as e:
        # Surface helm's output, then signal failure to the CLI caller
        print(e.stdout)
        sys.exit(1)
def use_cluster_credentials(cluster_name):
    """
    Drop the user into an interactive shell with command-line access to a cluster.

    Updates the current kubeconfig file to include the deployer's access
    credentials for the named cluster and marks it as the default cluster to
    work against.

    Meant exclusively for the `use-cluster-credentials` CLI command; nothing
    else in the deployer codebase calls it.
    """
    validate_cluster_config(cluster_name)

    cluster_file = find_absolute_path_to_cluster_file(cluster_name)
    with open(cluster_file) as f:
        cluster = Cluster(yaml.load(f), cluster_file.parent)

    # Cluster.auth() is decorated as a context manager, so it must be entered
    # with `with` rather than called like a plain function.
    with cluster.auth():
        # Spawn a login shell that inherits all env vars (KUBECONFIG
        # included); once the user quits that shell, this program resumes.
        # TODO: set the PS1 env var of the spawned shell to something like
        # f"cluster-{cluster.spec['name']}" so it's visually clear the user
        # is now operating in a different, cluster-scoped shell.
        subprocess.check_call([os.environ["SHELL"], "-l"])
def validate_cluster_config(cluster_name):
    """
    Validates cluster.yaml configuration against a JSONSchema.
    """
    # Schema lives in the repo, relative to the current working directory
    schema_path = Path.cwd() / "shared" / "deployer" / "cluster.schema.yaml"
    cluster_file = find_absolute_path_to_cluster_file(cluster_name)

    with open(cluster_file) as cf, open(schema_path) as sf:
        cluster_config = yaml.load(cf)
        schema = yaml.load(sf)
        # jsonschema raises a useful exception if validation fails
        jsonschema.validate(cluster_config, schema)
def exec_homes_shell(cluster_name, hub_name):
    """
    Pop a shell with the home directories of the given hub mounted.

    Homes will be mounted under /home.

    Args:
        cluster_name (str): name of the cluster the hub runs on
        hub_name (str): name of the hub whose homes should be mounted

    Raises:
        ValueError: if no hub named `hub_name` exists on the cluster.
    """
    config_file_path = find_absolute_path_to_cluster_file(cluster_name)
    with open(config_file_path) as f:
        cluster = Cluster(yaml.load(f), config_file_path.parent)

    with cluster.auth():
        hub = next(
            (hub for hub in cluster.hubs if hub.spec["name"] == hub_name), None
        )
        # Fail with a clear error instead of an AttributeError on None when
        # the given hub name doesn't match any hub in the cluster config
        if hub is None:
            raise ValueError(
                f"Hub {hub_name} not found in cluster {cluster_name}"
            )
        hub.exec_homes_shell()
def deploy(cluster_name, hub_name, config_path, dask_gateway_version):
    """
    Deploy one or more hubs in a given cluster

    Args:
        cluster_name (str): name of the cluster to deploy to
        hub_name (str): name of a single hub to deploy; a falsy value deploys
            every hub on the cluster
        config_path (str): path to the encrypted file holding the `auth0` and
            `secret_key` configuration
        dask_gateway_version (str): version of dask-gateway to deploy

    Raises:
        ValueError: if `hub_name` is given but matches no hub on the cluster.
    """
    validate_cluster_config(cluster_name)
    validate_hub_config(cluster_name, hub_name)
    assert_single_auth_method_enabled(cluster_name, hub_name)

    with get_decrypted_file(config_path) as decrypted_file_path:
        with open(decrypted_file_path) as f:
            config = yaml.load(f)

    # Most of our hubs use Auth0 for Authentication. This lets us programmatically
    # determine what auth provider each hub uses - GitHub, Google, etc. Without
    # this, we'd have to manually generate credentials for each hub - and we
    # don't want to do that. Auth0 domains are tied to a account, and
    # this is our auth0 domain for the paid account that 2i2c has.
    auth0 = config["auth0"]

    k = KeyProvider(auth0["domain"], auth0["client_id"], auth0["client_secret"])

    # Each hub needs a unique proxy.secretToken. However, we don't want
    # to manually generate & save it. We also don't want it to change with
    # each deploy - that causes a pod restart with downtime. So instead,
    # we generate it based on a single secret key (`PROXY_SECRET_KEY`)
    # combined with the name of each hub. This way, we get unique,
    # cryptographically secure proxy.secretTokens without having to
    # keep much state. We can rotate them by changing `PROXY_SECRET_KEY`.
    # However, if `PROXY_SECRET_KEY` leaks, that means all the hub's
    # proxy.secretTokens have leaked. So let's be careful with that!
    SECRET_KEY = bytes.fromhex(config["secret_key"])

    config_file_path = find_absolute_path_to_cluster_file(cluster_name)
    with open(config_file_path) as f:
        cluster = Cluster(yaml.load(f), config_file_path.parent)

    with cluster.auth():
        hubs = cluster.hubs
        if hub_name:
            hub = next((hub for hub in hubs if hub.spec["name"] == hub_name), None)
            # Fail with a clear error rather than an AttributeError on None
            # when no hub with this name exists in the cluster config
            if hub is None:
                raise ValueError(
                    f"Hub {hub_name} not found in cluster {cluster_name}"
                )
            print_colour(f"Deploying hub {hub.spec['name']}...")
            hub.deploy(k, SECRET_KEY, dask_gateway_version)
        else:
            for i, hub in enumerate(hubs):
                print_colour(
                    f"{i+1} / {len(hubs)}: Deploying hub {hub.spec['name']}..."
                )
                hub.deploy(k, SECRET_KEY, dask_gateway_version)
def assert_single_auth_method_enabled(cluster_name, hub_name):
    """
    For each hub of a specific cluster, it asserts that only a single auth
    method is enabled. An error is raised when an authenticator
    other than Auth0 is enabled and `auth0` is not explicitly disabled.

    Args:
        cluster_name (str): name of the cluster to check
        hub_name (str): name of a single hub to check; a falsy value checks
            every hub on the cluster

    Raises:
        ValueError: when a non-auth0 authenticator class is configured and
            auth0 is not explicitly disabled in the hub's cluster config.
    """
    _prepare_helm_charts_dependencies_and_schemas()

    config_file_path = find_absolute_path_to_cluster_file(cluster_name)
    with open(config_file_path) as f:
        cluster = Cluster(yaml.load(f), config_file_path.parent)

    hubs = []
    if hub_name:
        hubs = [h for h in cluster.hubs if h.spec["name"] == hub_name]
    else:
        hubs = cluster.hubs

    for i, hub in enumerate(hubs):
        print_colour(
            f"{i+1} / {len(hubs)}: Validating authenticator config for {hub.spec['name']}..."
        )

        # auth0 is the default unless a values file configures something else
        authenticator_class = "auth0"
        for values_file_name in hub.spec["helm_chart_values_files"]:
            # Encrypted ("secret") values files cannot be read directly; skip them
            if "secret" not in os.path.basename(values_file_name):
                values_file = config_file_path.parent.joinpath(values_file_name)
                # Load the hub extra config from its specific values files
                config = yaml.load(values_file)
                # Check if there's config that specifies an authenticator class
                try:
                    # Charts other than basehub nest jupyterhub config under
                    # a top-level `basehub` key
                    if hub.spec["helm_chart"] != "basehub":
                        authenticator_class = config["basehub"]["jupyterhub"][
                            "hub"]["config"]["JupyterHub"][
                            "authenticator_class"]
                    else:
                        authenticator_class = config["jupyterhub"]["hub"][
                            "config"]["JupyterHub"]["authenticator_class"]
                except KeyError:
                    pass

        # If the authenticator class is other than auth0, then raise an error
        # if auth0 is not explicitly disabled from the cluster config.
        # Use .get() so a hub spec with no `auth0` section at all does not
        # crash with a KeyError - an absent section counts as "not disabled".
        if authenticator_class != "auth0" and hub.spec.get("auth0", {}).get(
                "enabled", True):
            raise ValueError(
                f"Please disable auth0 for {hub.spec['name']} hub before using another authenticator class!"
            )
def get_central_grafana_token(cluster_name):
    """Returns the access token of the Grafana located in `cluster_name` cluster.
    This access token should have enough permissions to create datasources.
    """
    # Locate the encrypted file holding the central grafana token, inside the
    # cluster's config directory
    token_file = find_absolute_path_to_cluster_file(cluster_name).parent.joinpath(
        "enc-grafana-token.secret.yaml"
    )

    # Decrypt it, parse the YAML, and pull out the token
    with get_decrypted_file(token_file) as decrypted_file_path:
        with open(decrypted_file_path) as f:
            return yaml.load(f)["grafana_token"]
def deploy_support(cluster_name, cert_manager_version):
    """Deploy support components to a cluster

    Args:
        cluster_name (str): The name of the cluster to deploy support components to
        cert_manager_version (str): The version of cert-manager to deploy to the
            cluster, in the form vX.Y.Z. where X.Y.Z is valid SemVer.
    """
    validate_cluster_config(cluster_name)
    validate_support_config(cluster_name)

    cluster_file = find_absolute_path_to_cluster_file(cluster_name)
    with open(cluster_file) as f:
        cluster = Cluster(yaml.load(f), cluster_file.parent)

    # Nothing to deploy unless the cluster declares a support chart
    if not cluster.support:
        return

    with cluster.auth():
        cluster.deploy_support(cert_manager_version=cert_manager_version)
def get_central_grafana_url(central_cluster_name):
    """Return the https URL of the central cluster's Grafana instance.

    The URL is read from the `grafana.ingress.tls` section of the cluster's
    `support.values.yaml` file.

    Args:
        central_cluster_name (str): name of the cluster hosting the central Grafana

    Raises:
        ValueError: if no tls config is present for the Grafana ingress.
    """
    cluster_config_dir_path = find_absolute_path_to_cluster_file(
        central_cluster_name).parent

    config_file = cluster_config_dir_path.joinpath("support.values.yaml")
    with open(config_file) as f:
        support_config = yaml.load(f)

    grafana_tls_config = (support_config.get("grafana", {}).get("ingress",
                                                                {}).get("tls", []))
    if not grafana_tls_config:
        # Fixed grammar of the error message ("consider enable" -> "consider enabling")
        raise ValueError(
            f"No tls config was found for the Grafana instance of {central_cluster_name}. Please consider enabling it before using it as the central Grafana."
        )

    # We only have one tls host right now. Modify this when things change.
    return grafana_tls_config[0]["hosts"][0]
def validate_hub_config(cluster_name, hub_name):
    """
    Validates the provided non-encrypted helm chart values files for each hub of
    a specific cluster.
    """
    _prepare_helm_charts_dependencies_and_schemas()

    config_file_path = find_absolute_path_to_cluster_file(cluster_name)
    with open(config_file_path) as f:
        cluster = Cluster(yaml.load(f), config_file_path.parent)

    # Validate just the named hub when one is given, otherwise all of them
    if hub_name:
        hubs = [h for h in cluster.hubs if h.spec["name"] == hub_name]
    else:
        hubs = cluster.hubs

    for i, hub in enumerate(hubs):
        print_colour(
            f"{i+1} / {len(hubs)}: Validating non-encrypted hub values files for {hub.spec['name']}..."
        )

        # Render the hub's chart via `helm template`; helm performs the
        # actual validation and fails on bad values.
        cmd = [
            "helm",
            "template",
            str(helm_charts_dir.joinpath(hub.spec["helm_chart"])),
        ]
        # Only non-encrypted values files can be passed through
        cmd.extend(
            f"--values={config_file_path.parent.joinpath(values_file)}"
            for values_file in hub.spec["helm_chart_values_files"]
            if "secret" not in os.path.basename(values_file)
        )

        # Workaround the current requirement for dask-gateway 0.9.0 to have a
        # JupyterHub api-token specified, for updates if this workaround can be
        # removed, see https://github.com/dask/dask-gateway/issues/473.
        if hub.spec["helm_chart"] in ("daskhub", "binderhub"):
            cmd.append(
                "--set=dask-gateway.gateway.auth.jupyterhub.apiToken=dummy")

        try:
            subprocess.check_output(cmd, text=True)
        except subprocess.CalledProcessError as e:
            # Surface helm's output, then signal failure to the CLI caller
            print(e.stdout)
            sys.exit(1)
def get_cluster_prometheus_creds(cluster_name):
    """Retrieves the credentials of the prometheus instance running on the
    `cluster_name` cluster. These credentials are stored in
    `enc-support.secret.values.yaml` file of each cluster config directory.

    Args:
        cluster_name: name of the cluster
    Returns:
        dict object: {username: `username`, password: `password`}
    """
    secret_config_file = find_absolute_path_to_cluster_file(
        cluster_name
    ).parent.joinpath("enc-support.secret.values.yaml")

    # The file is encrypted; decrypt to a temporary path before parsing
    with get_decrypted_file(secret_config_file) as decrypted_path:
        with open(decrypted_path) as f:
            prometheus_config = yaml.load(f)

    # Missing section yields an empty dict rather than a KeyError
    return prometheus_config.get("prometheusIngressAuthSecret", {})
def get_cluster_prometheus_address(cluster_name):
    """Retrieves the address of the prometheus instance running on the
    `cluster_name` cluster. This address is stored in the `support.values.yaml`
    file of each cluster config directory.

    Args:
        cluster_name: name of the cluster
    Returns:
        string object: https address of the prometheus instance
    Raises:
        ValueError if
        - `prometheusIngressAuthSecret` isn't configured
        - `support["prometheus"]["server"]["ingress"]["tls"]` doesn't exist
    """
    support_values_file = find_absolute_path_to_cluster_file(
        cluster_name
    ).parent.joinpath("support.values.yaml")

    with open(support_values_file) as f:
        support_config = yaml.load(f)

    # Don't return the address if the prometheus instance wasn't securely exposed to the outside.
    auth_secret_enabled = support_config.get(
        "prometheusIngressAuthSecret", {}
    ).get("enabled", False)
    if not auth_secret_enabled:
        raise ValueError(
            f"`prometheusIngressAuthSecret` wasn't configured for {cluster_name}"
        )

    tls_config = (
        support_config.get("prometheus", {})
        .get("server", {})
        .get("ingress", {})
        .get("tls", [])
    )
    if not tls_config:
        raise ValueError(
            f"No tls config was found for the prometheus instance of {cluster_name}"
        )

    # We only have one tls host right now. Modify this when things change.
    return tls_config[0]["hosts"][0]
def deploy_grafana_dashboards(cluster_name):
    """
    Deploy grafana dashboards to a cluster that provide useful metrics for
    operating a JupyterHub

    Grafana dashboards and deployment mechanism in question are maintained in
    this repo: https://github.com/jupyterhub/grafana-dashboards
    """
    validate_cluster_config(cluster_name)
    validate_support_config(cluster_name)

    config_file_path = find_absolute_path_to_cluster_file(cluster_name)
    with open(config_file_path) as f:
        cluster = Cluster(yaml.load(f), config_file_path.parent)

    # If grafana support chart is not deployed, then there's nothing to do
    if not cluster.support:
        print_colour(
            "Support chart has not been deployed. Skipping Grafana dashboards deployment..."
        )
        return

    # Encrypted per-cluster file holding the grafana API token
    grafana_token_file = (
        config_file_path.parent).joinpath("enc-grafana-token.secret.yaml")

    # Read the cluster specific secret grafana token file
    with get_decrypted_file(grafana_token_file) as decrypted_file_path:
        with open(decrypted_file_path) as f:
            config = yaml.load(f)

    # Check GRAFANA_TOKEN exists in the secret config file before continuing
    if "grafana_token" not in config.keys():
        raise ValueError(
            f"`grafana_token` not provided in secret file! Please add it and try again: {grafana_token_file}"
        )

    # FIXME: We assume grafana_url and uses_tls config will be defined in the first
    # file listed under support.helm_chart_values_files.
    # NOTE(review): this indexing raises IndexError if the list is empty —
    # presumably validate_support_config guarantees at least one entry; confirm.
    support_values_file = cluster.support.get("helm_chart_values_files", [])[0]
    with open(config_file_path.parent.joinpath(support_values_file)) as f:
        support_values_config = yaml.load(f)

    # Get the url where grafana is running from the support values file
    grafana_url = (support_values_config.get("grafana",
                                             {}).get("ingress",
                                                     {}).get("hosts", {}))
    uses_tls = (support_values_config.get("grafana",
                                          {}).get("ingress",
                                                  {}).get("tls", {}))

    if not grafana_url:
        print_colour(
            "Couldn't find `config.grafana.ingress.hosts`. Skipping Grafana dashboards deployment..."
        )
        return

    # Only the first host is used; scheme depends on whether tls is configured
    grafana_url = (f"https://{grafana_url[0]}"
                   if uses_tls else f"http://{grafana_url[0]}")

    # Use the jupyterhub/grafana-dashboards deployer to deploy the dashboards to this cluster's grafana
    print_colour("Cloning jupyterhub/grafana-dashboards...")

    # Fixed directory name; the clone will fail if it already exists
    dashboards_dir = "grafana_dashboards"

    subprocess.check_call([
        "git",
        "clone",
        "https://github.com/jupyterhub/grafana-dashboards",
        dashboards_dir,
    ])

    # We need the existing env too for the deployer to be able to find jssonnet and grafonnet
    deploy_env = os.environ.copy()
    deploy_env.update({"GRAFANA_TOKEN": config["grafana_token"]})

    try:
        print_colour(f"Deploying grafana dashboards to {cluster_name}...")
        # Run the cloned repo's own deploy script from inside the checkout
        subprocess.check_call(["./deploy.py", grafana_url],
                              env=deploy_env,
                              cwd=dashboards_dir)

        print_colour(f"Done! Dashboards deployed to {grafana_url}.")
    finally:
        # Delete the directory where we cloned the repo.
        # The deployer cannot call jsonnet to deploy the dashboards if using a temp directory here.
        # Might be because opening more than once of a temp file is tried
        # (https://docs.python.org/3.8/library/tempfile.html#tempfile.NamedTemporaryFile)
        shutil.rmtree(dashboards_dir)
def run_hub_health_check(cluster_name, hub_name, check_dask_scaling=False):
    """Run a health check on a given hub on a given cluster. Optionally check scaling
    of dask workers if the hub is a daskhub.

    Args:
        cluster_name (str): The name of the cluster where the hub is deployed
        hub_name (str): The name of the hub to run a health check for
        check_dask_scaling (bool, optional): If true, run an additional check that dask
            workers can scale. Only applies to daskhubs. Defaults to False.

    Returns
        exit_code (int): The exit code of the pytest process. 0 for pass, any other
            integer number greater than 0 for failure.
    """
    # Read in the cluster.yaml file
    config_file_path = find_absolute_path_to_cluster_file(cluster_name)
    with open(config_file_path) as f:
        cluster = Cluster(yaml.load(f), config_file_path.parent)

    # Find the hub's config
    hub_indx = [
        indx for (indx, h) in enumerate(cluster.hubs)
        if h.spec["name"] == hub_name
    ]
    # Exactly one match is required; any other count exits the process,
    # so `hub` is always bound past this chain.
    if len(hub_indx) == 1:
        hub = cluster.hubs[hub_indx[0]]
    elif len(hub_indx) > 1:
        print_colour("ERROR: More than one hub with this name found!")
        sys.exit(1)
    elif len(hub_indx) == 0:
        print_colour("ERROR: No hubs with this name found!")
        sys.exit(1)

    print_colour(f"Running hub health check for {hub.spec['name']}...")

    # Check if this hub has a domain override file. If yes, apply override.
    if "domain_override_file" in hub.spec.keys():
        domain_override_file = hub.spec["domain_override_file"]

        # The override file is encrypted; decrypt before reading
        with get_decrypted_file(
                hub.cluster.config_path.joinpath(
                    domain_override_file)) as decrypted_path:
            with open(decrypted_path) as f:
                domain_override_config = yaml.load(f)

        hub.spec["domain"] = domain_override_config["domain"]

    # Retrieve hub's URL
    hub_url = f'https://{hub.spec["domain"]}'

    # Read in the service api token from a k8s Secret in the k8s cluster
    with cluster.auth():
        try:
            # jsonpath extracts the base64-encoded hub-health service token
            # from the "hub" Secret in the hub's namespace
            service_api_token_b64encoded = subprocess.check_output(
                [
                    "kubectl",
                    "get",
                    "secrets",
                    "hub",
                    f"--namespace={hub.spec['name']}",
                    r"--output=jsonpath={.data['hub\.services\.hub-health\.apiToken']}",
                ],
                text=True,
            )
        except subprocess.CalledProcessError as e:
            raise ValueError(
                f"Failed to acquire a JupyterHub API token for the hub-health service: {e.stdout}"
            )

    service_api_token = base64.b64decode(
        service_api_token_b64encoded).decode()

    # On failure, pytest prints out params to the test that failed.
    # This can contain sensitive info - so we hide stderr
    # FIXME: Don't use pytest - just call a function instead
    #
    # Show errors locally but redirect on CI
    gh_ci = os.environ.get("CI", "false")
    pytest_args = [
        "-q",
        "deployer/tests",
        f"--hub-url={hub_url}",
        f"--api-token={service_api_token}",
        f"--hub-type={hub.spec['helm_chart']}",
    ]

    if (hub.spec["helm_chart"] == "daskhub") and check_dask_scaling:
        pytest_args.append("--check-dask-scaling")

    if gh_ci == "true":
        print_colour("Testing on CI, not printing output")
        # Swallow both streams so secrets in failing test params don't leak
        # into CI logs
        with open(os.devnull, "w") as dn, redirect_stderr(dn), redirect_stdout(dn):
            exit_code = pytest.main(pytest_args)
    else:
        print_colour("Testing locally, do not redirect output")
        exit_code = pytest.main(pytest_args)
    if exit_code != 0:
        print("Health check failed!", file=sys.stderr)
        sys.exit(exit_code)
    else:
        print_colour("Health check succeeded!")

    # Only reached on success (exit_code == 0)
    return exit_code
def _build_config_filename(self, cluster_name, hub_name):
    """Return the path to the hub's encrypted values file, i.e.
    `enc-<hub_name>.secret.values.yaml` inside the cluster's config directory.
    """
    cluster_dir = find_absolute_path_to_cluster_file(cluster_name).parent
    return cluster_dir / f"enc-{hub_name}.secret.values.yaml"